SAMPLE TEST OF THE PROCESS¶

In [5]:
import os
import json
import folium
import pandas as pd
import numpy as np
from zipfile import ZipFile
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from IPython.display import display, HTML, Video
%matplotlib inline
# %matplotlib notebook
pd.set_option("float_format", '{:0.2f}'.format)
pd.set_option('display.max_columns', 30)
In [6]:
try:
    import google.colab
    IN_COLAB = True
except:
    IN_COLAB = False
In [7]:
if IN_COLAB:
    datasets_folder = './pvs-passive-vehicular-sensors-datasets/' 
else:
    datasets_folder = '../input/pvs-passive-vehicular-sensors-datasets/' 
In [8]:
kaggle_json = {"username":"","key":""}
In [9]:
with open('./kaggle.json', 'w') as f:
    json.dump(kaggle_json, f)
In [10]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
The syntax of the command is incorrect.
'cp' is not recognized as an internal or external command,
operable program or batch file.
In [11]:
import sys
!{sys.executable} -m pip install --upgrade kaggle
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: kaggle in c:\users\sn161663\appdata\roaming\python\python313\site-packages (1.7.4.5)
Requirement already satisfied: bleach in c:\programdata\anaconda3\lib\site-packages (from kaggle) (6.2.0)
Requirement already satisfied: certifi>=14.05.14 in c:\programdata\anaconda3\lib\site-packages (from kaggle) (2025.4.26)
Requirement already satisfied: charset-normalizer in c:\programdata\anaconda3\lib\site-packages (from kaggle) (3.3.2)
Requirement already satisfied: idna in c:\programdata\anaconda3\lib\site-packages (from kaggle) (3.7)
Requirement already satisfied: protobuf in c:\programdata\anaconda3\lib\site-packages (from kaggle) (5.29.3)
Requirement already satisfied: python-dateutil>=2.5.3 in c:\programdata\anaconda3\lib\site-packages (from kaggle) (2.9.0.post0)
Requirement already satisfied: python-slugify in c:\programdata\anaconda3\lib\site-packages (from kaggle) (5.0.2)
Requirement already satisfied: requests in c:\programdata\anaconda3\lib\site-packages (from kaggle) (2.32.3)
Requirement already satisfied: setuptools>=21.0.0 in c:\programdata\anaconda3\lib\site-packages (from kaggle) (72.1.0)
Requirement already satisfied: six>=1.10 in c:\programdata\anaconda3\lib\site-packages (from kaggle) (1.17.0)
Requirement already satisfied: text-unidecode in c:\programdata\anaconda3\lib\site-packages (from kaggle) (1.3)
Requirement already satisfied: tqdm in c:\programdata\anaconda3\lib\site-packages (from kaggle) (4.67.1)
Requirement already satisfied: urllib3>=1.15.1 in c:\programdata\anaconda3\lib\site-packages (from kaggle) (2.3.0)
Requirement already satisfied: webencodings in c:\programdata\anaconda3\lib\site-packages (from kaggle) (0.5.1)
Requirement already satisfied: colorama in c:\programdata\anaconda3\lib\site-packages (from tqdm->kaggle) (0.4.6)
In [12]:
import pathlib, shutil, os

src = pathlib.Path("kaggle.json")            
assert src.exists(), "Place kaggle.json next to this notebook (or fix the path)."

kdir = pathlib.Path.home() / ".kaggle"
kdir.mkdir(exist_ok=True)                    

dst = kdir / "kaggle.json"
shutil.copyfile(src, dst)

try:
    os.chmod(dst, 0o600)
except Exception as e:
    print("chmod skipped:", e)

print("✓ kaggle.json installed at:", dst)
✓ kaggle.json installed at: C:\Users\sn161663\.kaggle\kaggle.json
In [13]:
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi(); api.authenticate()
print("✅ Kaggle authentication OK")
✅ Kaggle authentication OK
In [14]:
import os
os.environ["KAGGLE_USERNAME"] = "<your_kaggle_username>"
os.environ["KAGGLE_KEY"]      = "<your_kaggle_api_key>"

from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi(); api.authenticate()
print("✅ Kaggle authentication OK via env vars")
✅ Kaggle authentication OK via env vars
In [15]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
The syntax of the command is incorrect.
'cp' is not recognized as an internal or external command,
operable program or batch file.
In [16]:
import pathlib, glob
DATA_DIR = pathlib.Path.cwd() / "PVS_dataset"
DATA_DIR.mkdir(exist_ok=True)

api.dataset_download_files(
    dataset="jefmenegazzo/pvs-passive-vehicular-sensors-datasets",
    path=str(DATA_DIR),
    unzip=False
)

zip_paths = glob.glob(str(DATA_DIR / "*.zip"))
print(zip_paths or "No ZIP found yet.")
Dataset URL: https://www.kaggle.com/datasets/jefmenegazzo/pvs-passive-vehicular-sensors-datasets
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
Cell In[16], line 5
      2 DATA_DIR = pathlib.Path.cwd() / "PVS_dataset"
      3 DATA_DIR.mkdir(exist_ok=True)
----> 5 api.dataset_download_files(
      6     dataset="jefmenegazzo/pvs-passive-vehicular-sensors-datasets",
      7     path=str(DATA_DIR),
      8     unzip=False
      9 )
     11 zip_paths = glob.glob(str(DATA_DIR / "*.zip"))
     12 print(zip_paths or "No ZIP found yet.")

File ~\AppData\Roaming\Python\Python313\site-packages\kaggle\api\kaggle_api_extended.py:1664, in KaggleApi.dataset_download_files(self, dataset, path, force, quiet, unzip, licenses)
   1662 request.dataset_slug = dataset_slug
   1663 request.dataset_version_number = dataset_version_number
-> 1664 response = kaggle.datasets.dataset_api_client.download_dataset(request)
   1666 outfile = os.path.join(effective_path, dataset_slug + '.zip')
   1667 if force or self.download_needed(response, outfile, quiet):

File ~\AppData\Roaming\Python\Python313\site-packages\kagglesdk\datasets\services\dataset_api_service.py:80, in DatasetApiClient.download_dataset(self, request)
     77 if request is None:
     78   request = ApiDownloadDatasetRequest()
---> 80 return self._client.call("datasets.DatasetApiService", "ApiDownloadDataset", request, HttpRedirect)

File ~\AppData\Roaming\Python\Python313\site-packages\kagglesdk\kaggle_http_client.py:124, in KaggleHttpClient.call(self, service_name, request_name, request, response_type)
    122 # Merge environment settings into session
    123 settings = self._session.merge_environment_settings(http_request.url, {}, None, None, None)
--> 124 http_response = self._session.send(http_request, **settings)
    126 response = self._prepare_response(response_type, http_response)
    127 return response

File C:\ProgramData\anaconda3\Lib\site-packages\requests\sessions.py:724, in Session.send(self, request, **kwargs)
    721 if allow_redirects:
    722     # Redirect resolving generator.
    723     gen = self.resolve_redirects(r, request, **kwargs)
--> 724     history = [resp for resp in gen]
    725 else:
    726     history = []

File C:\ProgramData\anaconda3\Lib\site-packages\requests\sessions.py:265, in SessionRedirectMixin.resolve_redirects(self, resp, req, stream, timeout, verify, cert, proxies, yield_requests, **adapter_kwargs)
    263     yield req
    264 else:
--> 265     resp = self.send(
    266         req,
    267         stream=stream,
    268         timeout=timeout,
    269         verify=verify,
    270         cert=cert,
    271         proxies=proxies,
    272         allow_redirects=False,
    273         **adapter_kwargs,
    274     )
    276     extract_cookies_to_jar(self.cookies, prepared_request, resp.raw)
    278     # extract redirect url, if any, for the next loop

File C:\ProgramData\anaconda3\Lib\site-packages\requests\sessions.py:746, in Session.send(self, request, **kwargs)
    743         pass
    745 if not stream:
--> 746     r.content
    748 return r

File C:\ProgramData\anaconda3\Lib\site-packages\requests\models.py:902, in Response.content(self)
    900         self._content = None
    901     else:
--> 902         self._content = b"".join(self.iter_content(CONTENT_CHUNK_SIZE)) or b""
    904 self._content_consumed = True
    905 # don't need to release the connection; that's been handled by urllib3
    906 # since we exhausted the data.

File C:\ProgramData\anaconda3\Lib\site-packages\requests\models.py:820, in Response.iter_content.<locals>.generate()
    818 if hasattr(self.raw, "stream"):
    819     try:
--> 820         yield from self.raw.stream(chunk_size, decode_content=True)
    821     except ProtocolError as e:
    822         raise ChunkedEncodingError(e)

File C:\ProgramData\anaconda3\Lib\site-packages\urllib3\response.py:1066, in HTTPResponse.stream(self, amt, decode_content)
   1064 else:
   1065     while not is_fp_closed(self._fp) or len(self._decoded_buffer) > 0:
-> 1066         data = self.read(amt=amt, decode_content=decode_content)
   1068         if data:
   1069             yield data

File C:\ProgramData\anaconda3\Lib\site-packages\urllib3\response.py:955, in HTTPResponse.read(self, amt, decode_content, cache_content)
    952     if len(self._decoded_buffer) >= amt:
    953         return self._decoded_buffer.get(amt)
--> 955 data = self._raw_read(amt)
    957 flush_decoder = amt is None or (amt != 0 and not data)
    959 if not data and len(self._decoded_buffer) == 0:

File C:\ProgramData\anaconda3\Lib\site-packages\urllib3\response.py:879, in HTTPResponse._raw_read(self, amt, read1)
    876 fp_closed = getattr(self._fp, "closed", False)
    878 with self._error_catcher():
--> 879     data = self._fp_read(amt, read1=read1) if not fp_closed else b""
    880     if amt is not None and amt != 0 and not data:
    881         # Platform-specific: Buggy versions of Python.
    882         # Close the connection when no data is returned
   (...)
    887         # not properly close the connection in all cases. There is
    888         # no harm in redundantly calling close.
    889         self._fp.close()

File C:\ProgramData\anaconda3\Lib\site-packages\urllib3\response.py:862, in HTTPResponse._fp_read(self, amt, read1)
    859     return self._fp.read1(amt) if amt is not None else self._fp.read1()
    860 else:
    861     # StringIO doesn't like amt=None
--> 862     return self._fp.read(amt) if amt is not None else self._fp.read()

File C:\ProgramData\anaconda3\Lib\http\client.py:479, in HTTPResponse.read(self, amt)
    476 if self.length is not None and amt > self.length:
    477     # clip the read to the "end of response"
    478     amt = self.length
--> 479 s = self.fp.read(amt)
    480 if not s and amt:
    481     # Ideally, we would raise IncompleteRead if the content-length
    482     # wasn't satisfied, but it might break compatibility.
    483     self._close_conn()

File C:\ProgramData\anaconda3\Lib\socket.py:719, in SocketIO.readinto(self, b)
    717     raise OSError("cannot read from timed out object")
    718 try:
--> 719     return self._sock.recv_into(b)
    720 except timeout:
    721     self._timeout_occurred = True

File C:\ProgramData\anaconda3\Lib\ssl.py:1304, in SSLSocket.recv_into(self, buffer, nbytes, flags)
   1300     if flags != 0:
   1301         raise ValueError(
   1302           "non-zero flags not allowed in calls to recv_into() on %s" %
   1303           self.__class__)
-> 1304     return self.read(nbytes, buffer)
   1305 else:
   1306     return super().recv_into(buffer, nbytes, flags)

File C:\ProgramData\anaconda3\Lib\ssl.py:1138, in SSLSocket.read(self, len, buffer)
   1136 try:
   1137     if buffer is not None:
-> 1138         return self._sslobj.read(len, buffer)
   1139     else:
   1140         return self._sslobj.read(len)

KeyboardInterrupt: 
In [17]:
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()
In [18]:
api.dataset_list_cli(search="jefmenegazzo/passive-vehicular-sensors-dataset-pvs")
ref                                                  title                                            size  lastUpdated                 downloadCount  voteCount  usabilityRating  
---------------------------------------------------  ----------------------------------------  -----------  --------------------------  -------------  ---------  ---------------  
jefmenegazzo/pvs-passive-vehicular-sensors-datasets  PVS - Passive Vehicular Sensors Datasets  44498315084  2021-01-27 20:26:04.113000          20832         78  0.9411765        
In [19]:
if os.path.exists(datasets_folder):
    shutil.rmtree(datasets_folder)

load_bar_datasets = tqdm(desc="Datasets Download", total=9)
load_bar_files = tqdm(desc="Files Download", total=3)

for dataset in range(1,10):
    dataset_path = os.path.join(datasets_folder, "PVS " + str(dataset)) 
    os.makedirs(dataset_path)
    load_bar_files.reset()

    for file in ["dataset_gps_mpu_left.csv", "dataset_gps_mpu_right.csv", "dataset_labels.csv"]:
        dataset_kaggle = 'jefmenegazzo/pvs-passive-vehicular-sensors-datasets'
        api.dataset_download_file(dataset=dataset_kaggle, file_name="PVS " + str(dataset) + "/" + file, path=dataset_path)
        load_bar_files.update(1)

    load_bar_datasets.update(1)
Datasets Download:   0%|          | 0/9 [00:00<?, ?it/s]
Files Download:   0%|          | 0/3 [00:00<?, ?it/s]
---------------------------------------------------------------------------
PermissionError                           Traceback (most recent call last)
Cell In[19], line 9
      7 for dataset in range(1,10):
      8     dataset_path = os.path.join(datasets_folder, "PVS " + str(dataset)) 
----> 9     os.makedirs(dataset_path)
     10     load_bar_files.reset()
     12     for file in ["dataset_gps_mpu_left.csv", "dataset_gps_mpu_right.csv", "dataset_labels.csv"]:

File C:\ProgramData\anaconda3\Lib\os.py:218, in makedirs(name, mode, exist_ok)
    216 if head and tail and not path.exists(head):
    217     try:
--> 218         makedirs(head, exist_ok=exist_ok)
    219     except FileExistsError:
    220         # Defeats race condition when another thread created the path
    221         pass

File C:\ProgramData\anaconda3\Lib\os.py:218, in makedirs(name, mode, exist_ok)
    216 if head and tail and not path.exists(head):
    217     try:
--> 218         makedirs(head, exist_ok=exist_ok)
    219     except FileExistsError:
    220         # Defeats race condition when another thread created the path
    221         pass

File C:\ProgramData\anaconda3\Lib\os.py:228, in makedirs(name, mode, exist_ok)
    226         return
    227 try:
--> 228     mkdir(name, mode)
    229 except OSError:
    230     # Cannot rely on checking for EEXIST, since the operating system
    231     # could give priority to other errors like EACCES or EROFS
    232     if not exist_ok or not path.isdir(name):

PermissionError: [WinError 5] Access is denied: '../input'
In [ ]:
from kaggle.api.kaggle_api_extended import KaggleApi
import pathlib, glob

DATA_DIR   = pathlib.Path.cwd() / "PVS_dataset"
SUBSET_DIR = pathlib.Path.cwd() / "PVS_subset"
DATA_DIR.mkdir(exist_ok=True); SUBSET_DIR.mkdir(exist_ok=True)

api = KaggleApi(); api.authenticate()
api.dataset_download_files(
    dataset="jefmenegazzo/pvs-passive-vehicular-sensors-datasets",
    path=str(DATA_DIR),
    unzip=False
)

ZIP_PATH = pathlib.Path(glob.glob(str(DATA_DIR/"*.zip"))[0])
Dataset URL: https://www.kaggle.com/datasets/jefmenegazzo/pvs-passive-vehicular-sensors-datasets
In [2]:
pvs-passive-vehicular-sensors-datasets.zip
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[2], line 1
----> 1 pvs-passive-vehicular-sensors-datasets.zip

NameError: name 'pvs' is not defined
In [3]:
Accident_Detection_Project_Dataset.zip
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[3], line 1
----> 1 Accident_Detection_Project_Dataset.zip

NameError: name 'Accident_Detection_Project_Dataset' is not defined
In [4]:
import pathlib, os, glob, pandas as pd

BASE_PATH = pathlib.Path(r"C:\Users\<sn161663>\Desktop\Accident_Detection_Project_Dataset")

print("Folders inside:", [f.name for f in BASE_PATH.iterdir() if f.is_dir()])
---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
Cell In[4], line 5
      1 import pathlib, os, glob, pandas as pd
      3 BASE_PATH = pathlib.Path(r"C:\Users\<sn161663>\Desktop\Accident_Detection_Project_Dataset")
----> 5 print("Folders inside:", [f.name for f in BASE_PATH.iterdir() if f.is_dir()])

File C:\ProgramData\anaconda3\Lib\pathlib\_local.py:575, in Path.iterdir(self)
    569 """Yield path objects of the directory contents.
    570 
    571 The children are yielded in arbitrary order, and the
    572 special entries '.' and '..' are not included.
    573 """
    574 root_dir = str(self)
--> 575 with os.scandir(root_dir) as scandir_it:
    576     paths = [entry.path for entry in scandir_it]
    577 if root_dir == '.':

OSError: [WinError 123] The filename, directory name, or volume label syntax is incorrect: 'C:\\Users\\<sn161663>\\Desktop\\Accident_Detection_Project_Dataset'
In [5]:
import pathlib, os, glob, pandas as pd

BASE_PATH = pathlib.Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset")

print("Folders inside:", [f.name for f in BASE_PATH.iterdir() if f.is_dir()])
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In[5], line 5
      1 import pathlib, os, glob, pandas as pd
      3 BASE_PATH = pathlib.Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset")
----> 5 print("Folders inside:", [f.name for f in BASE_PATH.iterdir() if f.is_dir()])

File C:\ProgramData\anaconda3\Lib\pathlib\_local.py:575, in Path.iterdir(self)
    569 """Yield path objects of the directory contents.
    570 
    571 The children are yielded in arbitrary order, and the
    572 special entries '.' and '..' are not included.
    573 """
    574 root_dir = str(self)
--> 575 with os.scandir(root_dir) as scandir_it:
    576     paths = [entry.path for entry in scandir_it]
    577 if root_dir == '.':

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'C:\\Users\\sn161663\\Desktop\\Accident_Detection_Project_Dataset'
In [7]:
import pathlib, os, glob, pandas as pd

BASE_PATH = pathlib.Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset.zip")

print("Folders inside:", [f.name for f in BASE_PATH.iterdir() if f.is_dir()])
---------------------------------------------------------------------------
NotADirectoryError                        Traceback (most recent call last)
Cell In[7], line 5
      1 import pathlib, os, glob, pandas as pd
      3 BASE_PATH = pathlib.Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset.zip")
----> 5 print("Folders inside:", [f.name for f in BASE_PATH.iterdir() if f.is_dir()])

File C:\ProgramData\anaconda3\Lib\pathlib\_local.py:575, in Path.iterdir(self)
    569 """Yield path objects of the directory contents.
    570 
    571 The children are yielded in arbitrary order, and the
    572 special entries '.' and '..' are not included.
    573 """
    574 root_dir = str(self)
--> 575 with os.scandir(root_dir) as scandir_it:
    576     paths = [entry.path for entry in scandir_it]
    577 if root_dir == '.':

NotADirectoryError: [WinError 267] The directory name is invalid: 'C:\\Users\\sn161663\\Desktop\\Accident_Detection_Project_Dataset.zip'
In [8]:
import pathlib, os, pandas as pd, glob

# paste your own full path below exactly as you copied it:
BASE_PATH = pathlib.Path(r"C:\Users\sn161663\OneDrive\Desktop\Accident_Detection_Project_Dataset")

# verify it exists
print("Exists?", BASE_PATH.exists())
Exists? False
In [9]:
import pathlib, os, pandas as pd, glob

# paste your own full path below exactly as you copied it:
BASE_PATH = pathlib.Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset.zip")

# verify it exists
print("Exists?", BASE_PATH.exists())
Exists? True
In [10]:
import pathlib, os, glob, pandas as pd

BASE_PATH = pathlib.Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset.zip")

print("Folders inside:", [f.name for f in BASE_PATH.iterdir() if f.is_dir()])
---------------------------------------------------------------------------
NotADirectoryError                        Traceback (most recent call last)
Cell In[10], line 5
      1 import pathlib, os, glob, pandas as pd
      3 BASE_PATH = pathlib.Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset.zip")
----> 5 print("Folders inside:", [f.name for f in BASE_PATH.iterdir() if f.is_dir()])

File C:\ProgramData\anaconda3\Lib\pathlib\_local.py:575, in Path.iterdir(self)
    569 """Yield path objects of the directory contents.
    570 
    571 The children are yielded in arbitrary order, and the
    572 special entries '.' and '..' are not included.
    573 """
    574 root_dir = str(self)
--> 575 with os.scandir(root_dir) as scandir_it:
    576     paths = [entry.path for entry in scandir_it]
    577 if root_dir == '.':

NotADirectoryError: [WinError 267] The directory name is invalid: 'C:\\Users\\sn161663\\Desktop\\Accident_Detection_Project_Dataset.zip'
In [11]:
import pathlib, glob, pandas as pd, os

BASE_PATH = pathlib.Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset")

print("Exists?", BASE_PATH.exists())
print("Sub-folders:", [f.name for f in BASE_PATH.iterdir() if f.is_dir()])
Exists? True
Sub-folders: ['PVS 1', 'PVS 2', 'PVS 3', 'PVS 4', 'PVS 5', 'PVS 6', 'PVS 7', 'PVS 8', 'PVS 9']
In [12]:
csvs = sorted(glob.glob(str(BASE_PATH / "**/*.csv"), recursive=True))
print("Total CSV files found:", len(csvs))
print("First few files:")
for f in csvs[:10]:
    print(f)
Total CSV files found: 72
First few files:
C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 1\dataset_gps.csv
C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 1\dataset_gps_mpu_left.csv
C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 1\dataset_gps_mpu_right.csv
C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 1\dataset_labels.csv
C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 1\dataset_mpu_left.csv
C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 1\dataset_mpu_right.csv
C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 1\dataset_settings_left.csv
C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 1\dataset_settings_right.csv
C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 2\dataset_gps.csv
C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 2\dataset_gps_mpu_left.csv
In [13]:
sample = csvs[0]
print("Sample file:", sample)
df = pd.read_csv(sample)
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
df.head()
Sample file: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 1\dataset_gps.csv
Shape: (1467, 20)
Columns: ['timestamp', 'latitude', 'longitude', 'elevation', 'accuracy', 'bearing', 'speed_meters_per_second', 'satellites', 'provider', 'hdop', 'vdop', 'pdop', 'geoidheight', 'ageofdgpsdata', 'dgpsid', 'activity', 'battery', 'annotation', 'distance_meters', 'elapsed_time_seconds']
Out[13]:
timestamp latitude longitude elevation accuracy bearing speed_meters_per_second satellites provider hdop vdop pdop geoidheight ageofdgpsdata dgpsid activity battery annotation distance_meters elapsed_time_seconds
0 1.577219e+09 -27.717812 -51.098895 948.770836 24.0 159.73294 0.053275 0 gps 0.8 1.7 1.9 3.6 NaN NaN NaN 87 NaN 0.000000 0.0
1 1.577219e+09 -27.717818 -51.098840 970.378820 12.0 NaN 0.000000 12 gps 0.8 1.5 1.7 3.6 NaN NaN NaN 87 NaN 5.442520 2.0
2 1.577219e+09 -27.717832 -51.098871 989.374267 4.0 NaN 0.000000 13 gps 0.8 1.6 1.8 3.6 NaN NaN NaN 86 NaN 3.404871 5.0
3 1.577219e+09 -27.717833 -51.098867 988.439139 4.0 NaN 0.000000 14 gps 0.8 1.7 1.9 3.6 NaN NaN NaN 86 NaN 0.421733 1.0
4 1.577219e+09 -27.717835 -51.098873 987.668730 4.0 NaN 0.000000 14 gps 0.8 1.6 1.8 3.6 NaN NaN NaN 86 NaN 0.574281 1.0
In [14]:
import numpy as np

wanted_cols = ['timestamp','ax','ay','az','gx','gy','gz','lat','lon','speed']  # adjust based on step 3

def safe_read(path, wanted):
    cols = pd.read_csv(path, nrows=0).columns.tolist()
    use  = [c for c in wanted if c in cols]
    df   = pd.read_csv(path, usecols=use)
    # downcast numeric types to save RAM
    for c in df.select_dtypes(include=['float64','int64']).columns:
        df[c] = pd.to_numeric(df[c], downcast='float')
    df["folder"] = os.path.basename(os.path.dirname(path))
    df["file"]   = os.path.basename(path)
    return df

frames = []
for p in csvs:
    try:
        frames.append(safe_read(p, wanted_cols))
    except Exception as e:
        print("Skipping", p, ":", e)

df_all = pd.concat(frames, ignore_index=True, sort=False)
print("Combined shape:", df_all.shape)
df_all.head()
Combined shape: (4335374, 4)
Out[14]:
timestamp folder file speed
0 1.577219e+09 PVS 1 dataset_gps.csv NaN
1 1.577219e+09 PVS 1 dataset_gps.csv NaN
2 1.577219e+09 PVS 1 dataset_gps.csv NaN
3 1.577219e+09 PVS 1 dataset_gps.csv NaN
4 1.577219e+09 PVS 1 dataset_gps.csv NaN
In [15]:
out_path = BASE_PATH / "accident_subset.csv"
df_all.to_csv(out_path, index=False)
print("✅ Saved working subset to:", out_path)
✅ Saved working subset to: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\accident_subset.csv
In [16]:
df = pd.read_csv(out_path)
In [17]:
print(df.info())
print(df.describe().T)
print("Missing values:\n", df.isnull().sum())

# Quick plot example
import matplotlib.pyplot as plt
plt.figure(figsize=(10,4))
df['ax'].sample(min(50000, len(df['ax'].dropna()))).hist(bins=60)
plt.title("Distribution of ax"); plt.show()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4335374 entries, 0 to 4335373
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   timestamp  float64
 1   folder     object 
 2   file       object 
 3   speed      float64
dtypes: float64(2), object(2)
memory usage: 132.3+ MB
None
               count          mean           std           min           25%  \
timestamp  4335374.0  1.577307e+09  71719.256410  1.577219e+09  1.577223e+09   
speed      2161810.0  9.983220e+00      7.465368  0.000000e+00  4.719659e+00   

                    50%           75%           max  
timestamp  1.577309e+09  1.577396e+09  1.577400e+09  
speed      7.074296e+00  1.664852e+01  2.854857e+01  
Missing values:
 timestamp          0
folder             0
file               0
speed        2173564
dtype: int64
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\indexes\base.py:3805, in Index.get_loc(self, key)
   3804 try:
-> 3805     return self._engine.get_loc(casted_key)
   3806 except KeyError as err:

File index.pyx:167, in pandas._libs.index.IndexEngine.get_loc()

File index.pyx:196, in pandas._libs.index.IndexEngine.get_loc()

File pandas\\_libs\\hashtable_class_helper.pxi:7081, in pandas._libs.hashtable.PyObjectHashTable.get_item()

File pandas\\_libs\\hashtable_class_helper.pxi:7089, in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'ax'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Cell In[17], line 8
      6 import matplotlib.pyplot as plt
      7 plt.figure(figsize=(10,4))
----> 8 df['ax'].sample(min(50000, len(df['ax'].dropna()))).hist(bins=60)
      9 plt.title("Distribution of ax"); plt.show()

File C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\frame.py:4102, in DataFrame.__getitem__(self, key)
   4100 if self.columns.nlevels > 1:
   4101     return self._getitem_multilevel(key)
-> 4102 indexer = self.columns.get_loc(key)
   4103 if is_integer(indexer):
   4104     indexer = [indexer]

File C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\indexes\base.py:3812, in Index.get_loc(self, key)
   3807     if isinstance(casted_key, slice) or (
   3808         isinstance(casted_key, abc.Iterable)
   3809         and any(isinstance(x, slice) for x in casted_key)
   3810     ):
   3811         raise InvalidIndexError(key)
-> 3812     raise KeyError(key) from err
   3813 except TypeError:
   3814     # If we have a listlike key, _check_indexing_error will raise
   3815     #  InvalidIndexError. Otherwise we fall through and re-raise
   3816     #  the TypeError.
   3817     self._check_indexing_error(key)

KeyError: 'ax'
<Figure size 1000x400 with 0 Axes>
In [18]:
print("Columns in your dataframe:\n", df.columns.tolist())
Columns in your dataframe:
 ['timestamp', 'folder', 'file', 'speed']
In [19]:
import matplotlib.pyplot as plt

plt.figure(figsize=(10,4))
df['speed'].dropna().sample(min(50000, len(df['speed'].dropna()))).hist(bins=60)
plt.title("Distribution of Speed")
plt.xlabel("Speed")
plt.ylabel("Frequency")
plt.show()
No description has been provided for this image
In [20]:
plt.figure(figsize=(12,5))
plt.plot(df['timestamp'], df['speed'], color='blue', linewidth=0.5)
plt.title("Speed over Time")
plt.xlabel("Timestamp")
plt.ylabel("Speed")
plt.show()
No description has been provided for this image
In [21]:
import pandas as pd
df['time'] = pd.to_datetime(df['timestamp'], unit='s')
plt.figure(figsize=(12,5))
plt.plot(df['time'], df['speed'], linewidth=0.5)
plt.title("Speed over Time (datetime)")
plt.xlabel("Time")
plt.ylabel("Speed")
plt.show()
No description has been provided for this image
In [22]:
import glob, os
csvs = glob.glob(str(BASE_PATH / "**/*.csv"), recursive=True)
for p in csvs[:20]:
    print(p)
C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\accident_subset.csv
C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 1\dataset_gps.csv
C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 1\dataset_gps_mpu_left.csv
C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 1\dataset_gps_mpu_right.csv
C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 1\dataset_labels.csv
C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 1\dataset_mpu_left.csv
C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 1\dataset_mpu_right.csv
C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 1\dataset_settings_left.csv
C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 1\dataset_settings_right.csv
C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 2\dataset_gps.csv
C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 2\dataset_gps_mpu_left.csv
C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 2\dataset_gps_mpu_right.csv
C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 2\dataset_labels.csv
C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 2\dataset_mpu_left.csv
C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 2\dataset_mpu_right.csv
C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 2\dataset_settings_left.csv
C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 2\dataset_settings_right.csv
C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 3\dataset_gps.csv
C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 3\dataset_gps_mpu_left.csv
C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 3\dataset_gps_mpu_right.csv
In [23]:
df_test = pd.read_csv(csvs[0])
print(df_test.columns)
Index(['timestamp', 'folder', 'file', 'speed'], dtype='object')

STEP 2: DATA UNDERSTANDING¶

Inventory¶

In [24]:
import pathlib, os, glob, pandas as pd

BASE_PATH = pathlib.Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset")
assert BASE_PATH.exists(), f"Path not found: {BASE_PATH}"

sessions = [p for p in BASE_PATH.iterdir() if p.is_dir()]
print("Sessions:", [s.name for s in sessions])

rows = []
for s in sessions:
    csvs = sorted(glob.glob(str(s / "*.csv")))
    for f in csvs:
        try:
            size_mb = os.path.getsize(f)/1e6
        except:
            size_mb = None
        rows.append({"session": s.name, "file_name": os.path.basename(f), "path": f, "size_MB": round(size_mb,2)})

inv = pd.DataFrame(rows).sort_values(["session","file_name"]).reset_index(drop=True)
display(inv.head(20))
inv.to_csv(BASE_PATH/"reports_data_inventory.csv", index=False)
print("✓ Saved:", BASE_PATH/"reports_data_inventory.csv")
Sessions: ['PVS 1', 'PVS 2', 'PVS 3', 'PVS 4', 'PVS 5', 'PVS 6', 'PVS 7', 'PVS 8', 'PVS 9']
session file_name path size_MB
0 PVS 1 dataset_gps.csv C:\Users\sn161663\Desktop\Accident_Detection_P... 0.22
1 PVS 1 dataset_gps_mpu_left.csv C:\Users\sn161663\Desktop\Accident_Detection_P... 84.48
2 PVS 1 dataset_gps_mpu_right.csv C:\Users\sn161663\Desktop\Accident_Detection_P... 84.70
3 PVS 1 dataset_labels.csv C:\Users\sn161663\Desktop\Accident_Detection_P... 4.18
4 PVS 1 dataset_mpu_left.csv C:\Users\sn161663\Desktop\Accident_Detection_P... 75.58
5 PVS 1 dataset_mpu_right.csv C:\Users\sn161663\Desktop\Accident_Detection_P... 75.79
6 PVS 1 dataset_settings_left.csv C:\Users\sn161663\Desktop\Accident_Detection_P... 0.00
7 PVS 1 dataset_settings_right.csv C:\Users\sn161663\Desktop\Accident_Detection_P... 0.00
8 PVS 2 dataset_gps.csv C:\Users\sn161663\Desktop\Accident_Detection_P... 0.23
9 PVS 2 dataset_gps_mpu_left.csv C:\Users\sn161663\Desktop\Accident_Detection_P... 72.72
10 PVS 2 dataset_gps_mpu_right.csv C:\Users\sn161663\Desktop\Accident_Detection_P... 73.07
11 PVS 2 dataset_labels.csv C:\Users\sn161663\Desktop\Accident_Detection_P... 3.62
12 PVS 2 dataset_mpu_left.csv C:\Users\sn161663\Desktop\Accident_Detection_P... 65.09
13 PVS 2 dataset_mpu_right.csv C:\Users\sn161663\Desktop\Accident_Detection_P... 65.44
14 PVS 2 dataset_settings_left.csv C:\Users\sn161663\Desktop\Accident_Detection_P... 0.00
15 PVS 2 dataset_settings_right.csv C:\Users\sn161663\Desktop\Accident_Detection_P... 0.00
16 PVS 3 dataset_gps.csv C:\Users\sn161663\Desktop\Accident_Detection_P... 0.20
17 PVS 3 dataset_gps_mpu_left.csv C:\Users\sn161663\Desktop\Accident_Detection_P... 62.57
18 PVS 3 dataset_gps_mpu_right.csv C:\Users\sn161663\Desktop\Accident_Detection_P... 62.45
19 PVS 3 dataset_labels.csv C:\Users\sn161663\Desktop\Accident_Detection_P... 3.07
✓ Saved: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\reports_data_inventory.csv

Schema Discovery¶

In [25]:
import pandas as pd

schema_rows = []
for _, r in inv.iterrows():
    path = r["path"]
    try:
        cols = pd.read_csv(path, nrows=0).columns.tolist()
        dtypes = pd.read_csv(path, nrows=1000).infer_objects().dtypes.astype(str).to_dict()
        schema_rows.append({
            "session": r["session"],
            "file_name": r["file_name"],
            "columns": "|".join(cols),
            "example_dtypes": "|".join([f"{k}:{v}" for k,v in dtypes.items()])
        })
    except Exception as e:
        schema_rows.append({"session": r["session"], "file_name": r["file_name"], "columns": f"ERROR: {e}", "example_dtypes": ""})

schema = pd.DataFrame(schema_rows)
display(schema.head(20))
schema.to_csv(BASE_PATH/"reports_schema_overview.csv", index=False)
print("✓ Saved:", BASE_PATH/"reports_schema_overview.csv")
session file_name columns example_dtypes
0 PVS 1 dataset_gps.csv timestamp|latitude|longitude|elevation|accurac... timestamp:float64|latitude:float64|longitude:f...
1 PVS 1 dataset_gps_mpu_left.csv timestamp|acc_x_dashboard|acc_y_dashboard|acc_... timestamp:float64|acc_x_dashboard:float64|acc_...
2 PVS 1 dataset_gps_mpu_right.csv timestamp|acc_x_dashboard|acc_y_dashboard|acc_... timestamp:float64|acc_x_dashboard:float64|acc_...
3 PVS 1 dataset_labels.csv paved_road|unpaved_road|dirt_road|cobblestone_... paved_road:int64|unpaved_road:int64|dirt_road:...
4 PVS 1 dataset_mpu_left.csv timestamp|acc_x_dashboard|acc_y_dashboard|acc_... timestamp:float64|acc_x_dashboard:float64|acc_...
5 PVS 1 dataset_mpu_right.csv timestamp|acc_x_dashboard|acc_y_dashboard|acc_... timestamp:float64|acc_x_dashboard:float64|acc_...
6 PVS 1 dataset_settings_left.csv placement|address_mpu|address_ak|gyroscope_ful... placement:object|address_mpu:object|address_ak...
7 PVS 1 dataset_settings_right.csv placement|address_mpu|address_ak|gyroscope_ful... placement:object|address_mpu:object|address_ak...
8 PVS 2 dataset_gps.csv timestamp|latitude|longitude|elevation|accurac... timestamp:float64|latitude:float64|longitude:f...
9 PVS 2 dataset_gps_mpu_left.csv timestamp|acc_x_dashboard|acc_y_dashboard|acc_... timestamp:float64|acc_x_dashboard:float64|acc_...
10 PVS 2 dataset_gps_mpu_right.csv timestamp|acc_x_dashboard|acc_y_dashboard|acc_... timestamp:float64|acc_x_dashboard:float64|acc_...
11 PVS 2 dataset_labels.csv paved_road|unpaved_road|dirt_road|cobblestone_... paved_road:int64|unpaved_road:int64|dirt_road:...
12 PVS 2 dataset_mpu_left.csv timestamp|acc_x_dashboard|acc_y_dashboard|acc_... timestamp:float64|acc_x_dashboard:float64|acc_...
13 PVS 2 dataset_mpu_right.csv timestamp|acc_x_dashboard|acc_y_dashboard|acc_... timestamp:float64|acc_x_dashboard:float64|acc_...
14 PVS 2 dataset_settings_left.csv placement|address_mpu|address_ak|gyroscope_ful... placement:object|address_mpu:object|address_ak...
15 PVS 2 dataset_settings_right.csv placement|address_mpu|address_ak|gyroscope_ful... placement:object|address_mpu:object|address_ak...
16 PVS 3 dataset_gps.csv timestamp|latitude|longitude|elevation|accurac... timestamp:float64|latitude:float64|longitude:f...
17 PVS 3 dataset_gps_mpu_left.csv timestamp|acc_x_dashboard|acc_y_dashboard|acc_... timestamp:float64|acc_x_dashboard:float64|acc_...
18 PVS 3 dataset_gps_mpu_right.csv timestamp|acc_x_dashboard|acc_y_dashboard|acc_... timestamp:float64|acc_x_dashboard:float64|acc_...
19 PVS 3 dataset_labels.csv paved_road|unpaved_road|dirt_road|cobblestone_... paved_road:int64|unpaved_road:int64|dirt_road:...
✓ Saved: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\reports_schema_overview.csv

Sampling rate & timestamp health¶

In [26]:
import numpy as np

def quick_time_health(csv_path, ts_col="timestamp", n_rows=300_000):
    try:
        use = pd.read_csv(csv_path, usecols=[ts_col], nrows=n_rows)
    except:
        return {"has_timestamp": False, "monotonic": None, "median_dt_ms": None, "notes": "timestamp missing or unreadable"}
    ts = use[ts_col].dropna().values
    if len(ts) < 3:
        return {"has_timestamp": True, "monotonic": None, "median_dt_ms": None, "notes": "too few timestamps"}
    diffs = np.diff(ts)
    mono = np.all(diffs >= 0)
    median_dt = np.median(diffs) * 1000 if np.median(diffs) < 10 else np.median(diffs)  # adjust if ts is in seconds
    return {"has_timestamp": True, "monotonic": bool(mono), "median_dt_ms": round(float(median_dt),2), "notes": ""}

health = []
for _, r in inv.iterrows():
    h = quick_time_health(r["path"])
    h.update({"session": r["session"], "file_name": r["file_name"]})
    health.append(h)

ts_health = pd.DataFrame(health)
display(ts_health.head(20))
ts_health.to_csv(BASE_PATH/"reports_timestamp_health.csv", index=False)
print("✓ Saved:", BASE_PATH/"reports_timestamp_health.csv")
has_timestamp monotonic median_dt_ms notes session file_name
0 True True 1000.0 PVS 1 dataset_gps.csv
1 True True 10.0 PVS 1 dataset_gps_mpu_left.csv
2 True True 10.0 PVS 1 dataset_gps_mpu_right.csv
3 False None NaN timestamp missing or unreadable PVS 1 dataset_labels.csv
4 True True 10.0 PVS 1 dataset_mpu_left.csv
5 True True 10.0 PVS 1 dataset_mpu_right.csv
6 False None NaN timestamp missing or unreadable PVS 1 dataset_settings_left.csv
7 False None NaN timestamp missing or unreadable PVS 1 dataset_settings_right.csv
8 True True 1000.0 PVS 2 dataset_gps.csv
9 True True 10.0 PVS 2 dataset_gps_mpu_left.csv
10 True True 10.0 PVS 2 dataset_gps_mpu_right.csv
11 False None NaN timestamp missing or unreadable PVS 2 dataset_labels.csv
12 True True 10.0 PVS 2 dataset_mpu_left.csv
13 True True 10.0 PVS 2 dataset_mpu_right.csv
14 False None NaN timestamp missing or unreadable PVS 2 dataset_settings_left.csv
15 False None NaN timestamp missing or unreadable PVS 2 dataset_settings_right.csv
16 True True 1000.0 PVS 3 dataset_gps.csv
17 True True 10.0 PVS 3 dataset_gps_mpu_left.csv
18 True True 10.0 PVS 3 dataset_gps_mpu_right.csv
19 False None NaN timestamp missing or unreadable PVS 3 dataset_labels.csv
✓ Saved: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\reports_timestamp_health.csv

Data Quality¶

In [28]:
from collections import defaultdict

def safe_numeric_summary(path, nrows=300_000):
    try:
        df = pd.read_csv(path, nrows=nrows)
    except Exception as e:
        return {"error": str(e)}
    summary = {}
    for c in df.columns:
        if pd.api.types.is_numeric_dtype(df[c]):
            col = df[c].dropna()
            if len(col) == 0:
                summary[c] = {"count": 0, "missing": int(df[c].isna().sum())}
            else:
                summary[c] = {
                    "count": int(col.shape[0]),
                    "missing": int(df[c].isna().sum()),
                    "min": float(col.min()),
                    "max": float(col.max()),
                    "mean": float(col.mean()),
                    "std": float(col.std()) if col.shape[0] > 1 else 0.0
                }
    return summary

sum_rows = []
for _, r in inv.iterrows():
    s = safe_numeric_summary(r["path"])
    if "error" in s:
        sum_rows.append({"session": r["session"], "file_name": r["file_name"], "metric": "error", "value": s["error"], "column": ""})
    else:
        for col, stats in s.items():
            for k,v in stats.items():
                sum_rows.append({"session": r["session"], "file_name": r["file_name"], "column": col, "metric": k, "value": v})

num_summary = pd.DataFrame(sum_rows)
display(num_summary.head(30))
num_summary.to_csv(BASE_PATH/"reports_numeric_summary_sampled.csv", index=False)
print("✓ Saved:", BASE_PATH/"reports_numeric_summary_sampled.csv")
session file_name column metric value
0 PVS 1 dataset_gps.csv timestamp count 1.467000e+03
1 PVS 1 dataset_gps.csv timestamp missing 0.000000e+00
2 PVS 1 dataset_gps.csv timestamp min 1.577219e+09
3 PVS 1 dataset_gps.csv timestamp max 1.577220e+09
4 PVS 1 dataset_gps.csv timestamp mean 1.577219e+09
5 PVS 1 dataset_gps.csv timestamp std 4.307223e+02
6 PVS 1 dataset_gps.csv latitude count 1.467000e+03
7 PVS 1 dataset_gps.csv latitude missing 0.000000e+00
8 PVS 1 dataset_gps.csv latitude min -2.771784e+01
9 PVS 1 dataset_gps.csv latitude max -2.768182e+01
10 PVS 1 dataset_gps.csv latitude mean -2.769508e+01
11 PVS 1 dataset_gps.csv latitude std 1.175118e-02
12 PVS 1 dataset_gps.csv longitude count 1.467000e+03
13 PVS 1 dataset_gps.csv longitude missing 0.000000e+00
14 PVS 1 dataset_gps.csv longitude min -5.113269e+01
15 PVS 1 dataset_gps.csv longitude max -5.109884e+01
16 PVS 1 dataset_gps.csv longitude mean -5.111933e+01
17 PVS 1 dataset_gps.csv longitude std 1.137600e-02
18 PVS 1 dataset_gps.csv elevation count 1.467000e+03
19 PVS 1 dataset_gps.csv elevation missing 0.000000e+00
20 PVS 1 dataset_gps.csv elevation min 8.748351e+02
21 PVS 1 dataset_gps.csv elevation max 9.959747e+02
22 PVS 1 dataset_gps.csv elevation mean 9.255807e+02
23 PVS 1 dataset_gps.csv elevation std 4.069137e+01
24 PVS 1 dataset_gps.csv accuracy count 1.467000e+03
25 PVS 1 dataset_gps.csv accuracy missing 0.000000e+00
26 PVS 1 dataset_gps.csv accuracy min 4.000000e+00
27 PVS 1 dataset_gps.csv accuracy max 2.400000e+01
28 PVS 1 dataset_gps.csv accuracy mean 4.092706e+00
29 PVS 1 dataset_gps.csv accuracy std 6.747199e-01
✓ Saved: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\reports_numeric_summary_sampled.csv

Label Reconnaissance¶

In [29]:
possible_label_names = {"label","labels","class","target","event","accident"}
label_hits = []
for _, r in inv.iterrows():
    try:
        cols = pd.read_csv(r["path"], nrows=0).columns
        found = [c for c in cols if c.lower() in possible_label_names]
        if found:
            label_hits.append({"session": r["session"], "file_name": r["file_name"], "label_cols": "|".join(found)})
    except:
        pass

label_index = pd.DataFrame(label_hits)
display(label_index if not label_index.empty else "No explicit label columns found.")
if not label_index.empty:
    label_index.to_csv(BASE_PATH/"reports_label_columns.csv", index=False)
'No explicit label columns found.'

Numeric Data Quality Checks

In [30]:
import numpy as np

def sample_numeric_stats(path, nrows=200_000):
    try:
        df = pd.read_csv(path, nrows=nrows)
        numeric_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
        summary = df[numeric_cols].describe().T
        summary["missing_%"] = df[numeric_cols].isna().mean()*100
        summary["file"] = os.path.basename(path)
        return summary
    except Exception as e:
        return pd.DataFrame({"error":[str(e)], "file":[path]})

numeric_summaries = []
for _, r in inv.iterrows():
    numeric_summaries.append(sample_numeric_stats(r["path"]))

num_df = pd.concat(numeric_summaries, ignore_index=True, sort=False)
display(num_df.head(10))
num_df.to_csv(BASE_PATH/"reports_numeric_overview.csv", index=False)
print("✓ Saved:", BASE_PATH/"reports_numeric_overview.csv")
count mean std min 25% 50% 75% max missing_% file
0 1467.0 1.577219e+09 430.722270 1.577219e+09 1.577219e+09 1.577219e+09 1.577220e+09 1.577220e+09 0.000000 dataset_gps.csv
1 1467.0 -2.769508e+01 0.011751 -2.771784e+01 -2.770213e+01 -2.768987e+01 -2.768708e+01 -2.768182e+01 0.000000 dataset_gps.csv
2 1467.0 -5.111933e+01 0.011376 -5.113269e+01 -5.112895e+01 -5.112469e+01 -5.110955e+01 -5.109884e+01 0.000000 dataset_gps.csv
3 1467.0 9.255807e+02 40.691374 8.748351e+02 8.890266e+02 9.084693e+02 9.615848e+02 9.959747e+02 0.000000 dataset_gps.csv
4 1467.0 4.092706e+00 0.674720 4.000000e+00 4.000000e+00 4.000000e+00 4.000000e+00 2.400000e+01 0.000000 dataset_gps.csv
5 1458.0 2.136291e+02 95.640711 1.006545e+00 1.382972e+02 1.973806e+02 3.171247e+02 3.597907e+02 0.613497 dataset_gps.csv
6 1467.0 9.286443e+00 7.820981 0.000000e+00 4.271836e+00 6.516178e+00 1.495587e+01 2.687448e+01 0.000000 dataset_gps.csv
7 1467.0 1.506748e+01 0.666034 0.000000e+00 1.500000e+01 1.500000e+01 1.500000e+01 1.600000e+01 0.000000 dataset_gps.csv
8 1467.0 8.117928e-01 0.037540 8.000000e-01 8.000000e-01 8.000000e-01 8.000000e-01 1.100000e+00 0.000000 dataset_gps.csv
9 1467.0 1.437832e+00 0.154977 1.000000e+00 1.300000e+00 1.500000e+00 1.600000e+00 2.100000e+00 0.000000 dataset_gps.csv
✓ Saved: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\reports_numeric_overview.csv

Sample Visualizations - Numeric Distribution¶

In [31]:
import matplotlib.pyplot as plt

sample_path = inv.iloc[0]["path"]
df_sample = pd.read_csv(sample_path, nrows=100_000)

numeric_cols = [c for c in df_sample.columns if pd.api.types.is_numeric_dtype(df_sample[c]) and c!="timestamp"]

for c in numeric_cols[:5]:
    plt.figure(figsize=(8,3))
    df_sample[c].hist(bins=60)
    plt.title(f"Distribution of {c}")
    plt.xlabel(c)
    plt.ylabel("Frequency")
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Sample Visualizations - Short Time-Series Preview¶

In [33]:
if "speed" in df_sample.columns:
    plt.figure(figsize=(12,4))
    plt.plot(df_sample["speed"].iloc[:10000])
    plt.title("Speed – First 10k samples")
    plt.xlabel("Sample index")
    plt.ylabel("Speed")
    plt.show()

# if accel axes exist
axes = [a for a in ["ax","ay","az"] if a in df_sample.columns]
if len(axes)==3:
    accel_mag = np.sqrt((df_sample[axes]**2).sum(axis=1))
    plt.figure(figsize=(12,4))
    plt.plot(accel_mag.iloc[:10000])
    plt.title("Acceleration Magnitude – First 10k samples")
    plt.xlabel("Sample index")
    plt.ylabel("|a|")
    plt.show()

GPS & Spatial Sanity¶

In [34]:
if {"lat","lon"}.issubset(df_sample.columns):
    lat_ok = df_sample["lat"].between(-90,90).mean()
    lon_ok = df_sample["lon"].between(-180,180).mean()
    print(f"Latitude valid ratio: {lat_ok:.3f}, Longitude valid ratio: {lon_ok:.3f}")

Preliminary Correlation Check¶

In [35]:
import seaborn as sns

numeric_cols = [c for c in df_sample.columns if pd.api.types.is_numeric_dtype(df_sample[c])]
plt.figure(figsize=(10,6))
sns.heatmap(df_sample[numeric_cols].corr(), cmap="coolwarm", center=0)
plt.title("Feature Correlation Heatmap (sample)")
plt.show()
No description has been provided for this image

STEP 3: DATA PREPARATION¶

Setup - Folders + Configuration¶

In [22]:
import pathlib, json, os, glob
import numpy as np
import pandas as pd

# 1) Point to your extracted folder that contains: PVS 1, PVS 2, ... PVS 9
BASE_PATH = pathlib.Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset")
assert BASE_PATH.exists(), f"Path not found: {BASE_PATH}"

# 2) Working folders
WORK   = BASE_PATH / "work"
CLEAN  = WORK / "clean_resampled"    # synchronized, filtered per session
WINDOWS= WORK / "windows"            # feature windows per session
for d in (WORK, CLEAN, WINDOWS):
    d.mkdir(parents=True, exist_ok=True)

# 3) Data-prep configuration (tune later if needed)
CFG = {
    "resample_hz": 50,          # common rate (Hz)
    "lowpass_hz": 15,           # accel/gyro low-pass cutoff (Hz)
    "window_sec": 3.0,          # sliding window length
    "overlap": 0.5,             # 50% overlap
    # Proxy-label thresholds (units: see notes below)
    "accel_mag_g": 1.8,         # ≈1.8 g (if your accel is already m/s^2, multiply by 9.81 later)
    "gyro_mag_dps": 250,        # deg/s
    "speed_drop_mps": 4.0,      # sudden speed drop over short window (m/s)
    "speed_drop_window_s": 1.0
}
(WORK / "config.json").write_text(json.dumps(CFG, indent=2))
print("✓ Config written:", WORK / "config.json")
✓ Config written: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\config.json

Discover files per session¶

In [14]:
# list sessions
SESSIONS = [p for p in BASE_PATH.iterdir() if p.is_dir() and p.name.lower().startswith("pvs")]
SESSIONS = sorted(SESSIONS, key=lambda p: p.name)
print("Sessions found:", [s.name for s in SESSIONS])

def find_files(session_path):
    files = sorted(glob.glob(str(session_path / "*.csv")))
    out = {"accel": None, "gyro": None, "gps": None, "labels": None}
    for f in files:
        try:
            cols = pd.read_csv(f, nrows=0).columns.str.lower().tolist()
        except Exception:
            continue
        cs = set(cols)
        if {"ax","ay","az"}.issubset(cs): out["accel"] = f
        if {"gx","gy","gz"}.issubset(cs): out["gyro"]  = f
        if {"lat","lon"}.issubset(cs) or ("speed" in cs): out["gps"] = f
        if any(c in cs for c in ["label","labels","class","target","event","accident"]): out["labels"] = f
    return out

file_map = {s.name: find_files(s) for s in SESSIONS}
file_map
Sessions found: ['PVS 1', 'PVS 2', 'PVS 3', 'PVS 4', 'PVS 5', 'PVS 6', 'PVS 7', 'PVS 8', 'PVS 9']
Out[14]:
{'PVS 1': {'accel': None,
  'gyro': None,
  'gps': 'C:\\Users\\sn161663\\Desktop\\Accident_Detection_Project_Dataset\\PVS 1\\dataset_gps_mpu_right.csv',
  'labels': None},
 'PVS 2': {'accel': None,
  'gyro': None,
  'gps': 'C:\\Users\\sn161663\\Desktop\\Accident_Detection_Project_Dataset\\PVS 2\\dataset_gps_mpu_right.csv',
  'labels': None},
 'PVS 3': {'accel': None,
  'gyro': None,
  'gps': 'C:\\Users\\sn161663\\Desktop\\Accident_Detection_Project_Dataset\\PVS 3\\dataset_gps_mpu_right.csv',
  'labels': None},
 'PVS 4': {'accel': None,
  'gyro': None,
  'gps': 'C:\\Users\\sn161663\\Desktop\\Accident_Detection_Project_Dataset\\PVS 4\\dataset_gps_mpu_right.csv',
  'labels': None},
 'PVS 5': {'accel': None,
  'gyro': None,
  'gps': 'C:\\Users\\sn161663\\Desktop\\Accident_Detection_Project_Dataset\\PVS 5\\dataset_gps_mpu_right.csv',
  'labels': None},
 'PVS 6': {'accel': None,
  'gyro': None,
  'gps': 'C:\\Users\\sn161663\\Desktop\\Accident_Detection_Project_Dataset\\PVS 6\\dataset_gps_mpu_right.csv',
  'labels': None},
 'PVS 7': {'accel': None,
  'gyro': None,
  'gps': 'C:\\Users\\sn161663\\Desktop\\Accident_Detection_Project_Dataset\\PVS 7\\dataset_gps_mpu_right.csv',
  'labels': None},
 'PVS 8': {'accel': None,
  'gyro': None,
  'gps': 'C:\\Users\\sn161663\\Desktop\\Accident_Detection_Project_Dataset\\PVS 8\\dataset_gps_mpu_right.csv',
  'labels': None},
 'PVS 9': {'accel': None,
  'gyro': None,
  'gps': 'C:\\Users\\sn161663\\Desktop\\Accident_Detection_Project_Dataset\\PVS 9\\dataset_gps_mpu_right.csv',
  'labels': None}}

Timestamp, filters, resampling, synchronization - Helpers¶

In [15]:
from scipy.signal import butter, filtfilt

def ensure_time_index(df, ts_col="timestamp"):
    if ts_col not in df.columns:
        raise ValueError(f"timestamp column '{ts_col}' not found.")
    ts = df[ts_col].astype("float64")
    # seconds vs milliseconds heuristic
    if ts.dropna().median() > 1e11:
        t = pd.to_datetime(ts, unit="ms", errors="coerce")
    else:
        t = pd.to_datetime(ts, unit="s", errors="coerce")
    out = df.drop(columns=[ts_col]).copy()
    out.index = t
    out = out[~out.index.duplicated(keep="first")].sort_index()
    return out

def butter_lowpass(series, cutoff_hz, fs_hz, order=4):
    if series.isna().all():
        return series
    b, a = butter(order, cutoff_hz/(0.5*fs_hz), btype='low', analog=False)
    x = series.interpolate(limit=5).bfill().ffill()
    try:
        y = filtfilt(b, a, x.values)
        return pd.Series(y, index=series.index)
    except Exception:
        return series

def resample_df(df, fs_hz, agg="mean"):
    rule = f"{int(1000/fs_hz)}ms"       # 'ms' (milliseconds)
    num_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
    return df[num_cols].resample(rule).agg(agg)

def sync_and_filter(session_name, files, cfg=CFG):
    parts = []
    # Accelerometer
    if files["accel"]:
        a = pd.read_csv(files["accel"])
        a = ensure_time_index(a, "timestamp")
        for ax in [c for c in ["ax","ay","az"] if c in a.columns]:
            a[ax] = butter_lowpass(a[ax], cfg["lowpass_hz"], cfg["resample_hz"])
        a = resample_df(a, cfg["resample_hz"], "mean")
        parts.append(a.add_prefix("acc_"))
    # Gyroscope
    if files["gyro"]:
        g = pd.read_csv(files["gyro"])
        g = ensure_time_index(g, "timestamp")
        for gx in [c for c in ["gx","gy","gz"] if c in g.columns]:
            g[gx] = butter_lowpass(g[gx], cfg["lowpass_hz"], cfg["resample_hz"])
        g = resample_df(g, cfg["resample_hz"], "mean")
        parts.append(g.add_prefix("gyro_"))
    # GPS / speed
    if files["gps"]:
        gps = pd.read_csv(files["gps"])
        gps = ensure_time_index(gps, "timestamp")
        gps = resample_df(gps, cfg["resample_hz"], "mean")
        parts.append(gps.add_prefix("gps_"))

    if not parts:
        raise RuntimeError(f"No usable sensor files for {session_name}")

    df = parts[0].join(parts[1:], how="outer").sort_index()
    df = df.interpolate(limit=5).ffill().bfill()
    df["session"] = session_name
    return df

Proxy Labels¶

In [16]:
def add_proxy_columns(df, cfg=CFG):
    # acceleration magnitude (if present)
    if all(c in df.columns for c in ["acc_ax","acc_ay","acc_az"]):
        df["acc_mag"] = np.sqrt(df["acc_ax"]**2 + df["acc_ay"]**2 + df["acc_az"]**2)
    # gyro magnitude (if present)
    if all(c in df.columns for c in ["gyro_gx","gyro_gy","gyro_gz"]):
        df["gyro_mag"] = np.sqrt(df["gyro_gx"]**2 + df["gyro_gy"]**2 + df["gyro_gz"]**2)
    # sudden speed drop (m/s) over short window
    if "gps_speed" in df.columns:
        k = max(int(cfg["speed_drop_window_s"] * cfg["resample_hz"]), 2)
        s = df["gps_speed"].ffill()
        df["speed_drop"] = s - s.rolling(k, min_periods=1).min()
    else:
        df["speed_drop"] = np.nan

    conds = []
    if "acc_mag" in df.columns:
        conds.append(df["acc_mag"] >= cfg["accel_mag_g"] * 9.81)   # remove *9.81 if accel already in g
    if "gyro_mag" in df.columns:
        conds.append(df["gyro_mag"] >= cfg["gyro_mag_dps"])
    if df["speed_drop"].notna().any():
        conds.append(df["speed_drop"] >= cfg["speed_drop_mps"])

    df["proxy_incident"] = 0
    if conds:
        df.loc[np.logical_or.reduce(conds), "proxy_incident"] = 1
    return df

Process each session - save clean, resampled, labeled data¶

In [17]:
PROCESSED = []
for s in SESSIONS:
    files = file_map[s.name]
    try:
        df = sync_and_filter(s.name, files, CFG)
        df = add_proxy_columns(df, CFG)
        out = CLEAN / f"{s.name}.parquet"
        df.to_parquet(out, index=True)
        PROCESSED.append({"session": s.name, "rows": len(df), "path": str(out)})
        print(f"✓ {s.name}: {len(df):,} rows -> {out.name}")
    except Exception as e:
        print(f"✗ {s.name}:", e)

pd.DataFrame(PROCESSED).to_csv(WORK/"processed_sessions.csv", index=False)
print("Saved index:", WORK/"processed_sessions.csv")
✓ PVS 1: 72,019 rows -> PVS 1.parquet
✓ PVS 2: 62,343 rows -> PVS 2.parquet
✓ PVS 3: 52,908 rows -> PVS 3.parquet
✓ PVS 4: 66,246 rows -> PVS 4.parquet
✓ PVS 5: 66,939 rows -> PVS 5.parquet
✓ PVS 6: 48,141 rows -> PVS 6.parquet
✓ PVS 7: 64,274 rows -> PVS 7.parquet
✓ PVS 8: 61,810 rows -> PVS 8.parquet
✓ PVS 9: 45,778 rows -> PVS 9.parquet
Saved index: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\processed_sessions.csv
In [8]:
test_df = pd.read_parquet(CLEAN / "PVS 1.parquet")
test_df.head(), test_df.columns.tolist()[:20]
Out[8]:
(                         gps_acc_x_dashboard  gps_acc_y_dashboard  \
 timestamp                                                           
 2019-12-24 20:19:56.540             0.314897             0.187227   
 2019-12-24 20:19:56.560             0.297539             0.187227   
 2019-12-24 20:19:56.580             0.308912             0.199198   
 2019-12-24 20:19:56.600             0.317292             0.157299   
 2019-12-24 20:19:56.620             0.295744             0.148919   
 
                          gps_acc_z_dashboard  gps_acc_x_above_suspension  \
 timestamp                                                                  
 2019-12-24 20:19:56.540             9.863572                    0.314750   
 2019-12-24 20:19:56.560             9.869558                    0.313553   
 2019-12-24 20:19:56.580             9.842024                    0.332706   
 2019-12-24 20:19:56.600             9.859981                    0.297991   
 2019-12-24 20:19:56.620             9.885120                    0.266866   
 
                          gps_acc_y_above_suspension  \
 timestamp                                             
 2019-12-24 20:19:56.540                    0.166426   
 2019-12-24 20:19:56.560                    0.154455   
 2019-12-24 20:19:56.580                    0.159244   
 2019-12-24 20:19:56.600                    0.156849   
 2019-12-24 20:19:56.620                    0.156849   
 
                          gps_acc_z_above_suspension  \
 timestamp                                             
 2019-12-24 20:19:56.540                    9.808869   
 2019-12-24 20:19:56.560                    9.855556   
 2019-12-24 20:19:56.580                    9.831614   
 2019-12-24 20:19:56.600                    9.824431   
 2019-12-24 20:19:56.620                    9.835205   
 
                          gps_acc_x_below_suspension  \
 timestamp                                             
 2019-12-24 20:19:56.540                    0.529819   
 2019-12-24 20:19:56.560                    0.525031   
 2019-12-24 20:19:56.580                    0.533411   
 2019-12-24 20:19:56.600                    0.498695   
 2019-12-24 20:19:56.620                    0.496300   
 
                          gps_acc_y_below_suspension  \
 timestamp                                             
 2019-12-24 20:19:56.540                    0.097111   
 2019-12-24 20:19:56.560                    0.100702   
 2019-12-24 20:19:56.580                    0.092323   
 2019-12-24 20:19:56.600                    0.404766   
 2019-12-24 20:19:56.620                    0.097111   
 
                          gps_acc_z_below_suspension  gps_gyro_x_dashboard  \
 timestamp                                                                   
 2019-12-24 20:19:56.540                    9.930623              0.221062   
 2019-12-24 20:19:56.560                    9.948579              0.045586   
 2019-12-24 20:19:56.580                    9.887527              0.175285   
 2019-12-24 20:19:56.600                    9.923440              0.205803   
 2019-12-24 20:19:56.620                    9.856403             -0.206184   
 
                          ...  gps_temp_dashboard  gps_temp_above_suspension  \
 timestamp                ...                                                  
 2019-12-24 20:19:56.540  ...           34.274628                  34.035014   
 2019-12-24 20:19:56.560  ...           34.358493                  34.082936   
 2019-12-24 20:19:56.580  ...           34.370474                  33.939168   
 2019-12-24 20:19:56.600  ...           34.514242                  33.963129   
 2019-12-24 20:19:56.620  ...           34.370474                  34.082936   
 
                          gps_temp_below_suspension  gps_timestamp_gps  \
 timestamp                                                               
 2019-12-24 20:19:56.540                  31.926408       1.577219e+09   
 2019-12-24 20:19:56.560                  31.734717       1.577219e+09   
 2019-12-24 20:19:56.580                  31.447180       1.577219e+09   
 2019-12-24 20:19:56.600                  31.447180       1.577219e+09   
 2019-12-24 20:19:56.620                  31.638871       1.577219e+09   
 
                          gps_latitude  gps_longitude  gps_speed  session  \
 timestamp                                                                  
 2019-12-24 20:19:56.540    -27.717841     -51.098865   0.009128    PVS 1   
 2019-12-24 20:19:56.560    -27.717841     -51.098865   0.009128    PVS 1   
 2019-12-24 20:19:56.580    -27.717841     -51.098865   0.009128    PVS 1   
 2019-12-24 20:19:56.600    -27.717841     -51.098865   0.009128    PVS 1   
 2019-12-24 20:19:56.620    -27.717841     -51.098865   0.009128    PVS 1   
 
                          speed_drop  proxy_incident  
 timestamp                                            
 2019-12-24 20:19:56.540         0.0               0  
 2019-12-24 20:19:56.560         0.0               0  
 2019-12-24 20:19:56.580         0.0               0  
 2019-12-24 20:19:56.600         0.0               0  
 2019-12-24 20:19:56.620         0.0               0  
 
 [5 rows x 34 columns],
 ['gps_acc_x_dashboard',
  'gps_acc_y_dashboard',
  'gps_acc_z_dashboard',
  'gps_acc_x_above_suspension',
  'gps_acc_y_above_suspension',
  'gps_acc_z_above_suspension',
  'gps_acc_x_below_suspension',
  'gps_acc_y_below_suspension',
  'gps_acc_z_below_suspension',
  'gps_gyro_x_dashboard',
  'gps_gyro_y_dashboard',
  'gps_gyro_z_dashboard',
  'gps_gyro_x_above_suspension',
  'gps_gyro_y_above_suspension',
  'gps_gyro_z_above_suspension',
  'gps_gyro_x_below_suspension',
  'gps_gyro_y_below_suspension',
  'gps_gyro_z_below_suspension',
  'gps_mag_x_dashboard',
  'gps_mag_y_dashboard'])

Windowing - create training instances (features + label)¶

In [18]:
def window_iter(df, win_s, step_s, fs):
    n = len(df); win = int(win_s * fs); step = int(step_s * fs)
    i = 0
    while i + win <= n:
        seg = df.iloc[i:i+win]
        yield seg.index[0], seg.index[-1], seg
        i += step

def basic_window_features(seg):
    feats = {}
    cols = [c for c in seg.columns if pd.api.types.is_numeric_dtype(seg[c])]
    for c in cols:
        x = seg[c].dropna()
        if x.empty: continue
        feats[f"{c}_mean"] = float(x.mean())
        feats[f"{c}_std"]  = float(x.std())
        feats[f"{c}_min"]  = float(x.min())
        feats[f"{c}_max"]  = float(x.max())
        feats[f"{c}_rms"]  = float(np.sqrt(np.mean(np.square(x))))
    return feats

def build_windows_for_session(session_name, cfg=CFG):
    path = CLEAN / f"{session_name}.parquet"
    df = pd.read_parquet(path)

    fs = cfg["resample_hz"]
    win_s = cfg["window_sec"]
    step_s = cfg["window_sec"] * (1 - cfg["overlap"])

    rows = []
    for t0, t1, seg in window_iter(df, win_s, step_s, fs):
        feats = basic_window_features(seg)
        y = int(seg["proxy_incident"].max()) if "proxy_incident" in seg.columns else 0
        rows.append({"session": session_name, "t_start": t0, "t_end": t1, "y": y, **feats})

    win_df = pd.DataFrame(rows)
    out = WINDOWS / f"{session_name}_windows.parquet"
    win_df.to_parquet(out, index=False)
    return out, len(win_df), (win_df["y"].mean() if len(win_df) else 0.0)
In [19]:
SUMMARY = []
for s in SESSIONS:
    p = CLEAN / f"{s.name}.parquet"
    if not p.exists():
        print(f"Skipping {s.name} (no cleaned file)")
        continue
    out, nrows, pos_rate = build_windows_for_session(s.name, CFG)
    SUMMARY.append({"session": s.name, "windows": nrows, "positive_rate": round(pos_rate, 4), "path": str(out)})
    print(f"✓ Windows {s.name}: {nrows:,}  (pos_rate={pos_rate:.4f})")

pd.DataFrame(SUMMARY).to_csv(WORK/"windows_summary.csv", index=False)
print("Saved:", WORK/"windows_summary.csv")
✓ Windows PVS 1: 959  (pos_rate=0.0000)
✓ Windows PVS 2: 830  (pos_rate=0.0024)
✓ Windows PVS 3: 704  (pos_rate=0.0000)
✓ Windows PVS 4: 882  (pos_rate=0.0000)
✓ Windows PVS 5: 891  (pos_rate=0.0022)
✓ Windows PVS 6: 640  (pos_rate=0.0031)
✓ Windows PVS 7: 855  (pos_rate=0.0035)
✓ Windows PVS 8: 823  (pos_rate=0.0036)
✓ Windows PVS 9: 609  (pos_rate=0.0000)
Saved: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\windows_summary.csv

Sanity checks on prepared data¶

In [23]:
# Cleaned file check
df_clean = pd.read_parquet(CLEAN / f"{SESSIONS[0].name}.parquet")   # first session
print("Clean shape:", df_clean.shape)
have = [c for c in ["acc_mag","gyro_mag","gps_speed","speed_drop","proxy_incident"]
        if c in df_clean.columns]
print("Summary columns:", have)
if have: display(df_clean[have].describe())

# Window file check
win_files = sorted(glob.glob(str(WINDOWS/"*_windows.parquet")))
print("Window files:", len(win_files))
if win_files:
    w = pd.read_parquet(win_files[0])
    print("Windows shape:", w.shape)
    if "y" in w.columns:
        print("Class balance:", (w["y"].value_counts(normalize=True)
                                  .rename({0:"normal",1:"incident"})))
    display(w.head())
Clean shape: (72019, 34)
Summary columns: ['gps_speed', 'speed_drop', 'proxy_incident']
gps_speed speed_drop proxy_incident
count 72019.000000 72019.000000 72019.0
mean 9.556739 0.159041 0.0
std 7.746441 0.285608 0.0
min 0.002526 0.000000 0.0
25% 4.508887 0.000000 0.0
50% 6.618945 0.004228 0.0
75% 16.647470 0.225994 0.0
max 26.874480 3.895296 0.0
Window files: 9
Windows shape: (959, 169)
Class balance: y
normal    1.0
Name: proportion, dtype: float64
session t_start t_end y gps_acc_x_dashboard_mean gps_acc_x_dashboard_std gps_acc_x_dashboard_min gps_acc_x_dashboard_max gps_acc_x_dashboard_rms gps_acc_y_dashboard_mean ... speed_drop_mean speed_drop_std speed_drop_min speed_drop_max speed_drop_rms proxy_incident_mean proxy_incident_std proxy_incident_min proxy_incident_max proxy_incident_rms
0 PVS 1 2019-12-24 20:19:56.540 2019-12-24 20:19:59.520 0 0.306168 0.016296 0.270605 0.365176 0.306598 0.165220 ... 0.000099 0.000143 0.0 0.000303 0.000173 0.0 0.0 0.0 0.0 0.0
1 PVS 1 2019-12-24 20:19:58.040 2019-12-24 20:20:01.020 0 0.307486 0.026242 0.223918 0.374752 0.308596 0.163978 ... 0.000201 0.000913 0.0 0.007933 0.000932 0.0 0.0 0.0 0.0 0.0
2 PVS 1 2019-12-24 20:19:59.540 2019-12-24 20:20:02.520 0 0.306282 0.030071 0.195187 0.385526 0.307745 0.163780 ... 0.002592 0.003733 0.0 0.007933 0.004534 0.0 0.0 0.0 0.0 0.0
3 PVS 1 2019-12-24 20:20:01.040 2019-12-24 20:20:04.020 0 0.307264 0.024270 0.195187 0.385526 0.308215 0.164431 ... 0.002487 0.003691 0.0 0.007933 0.004441 0.0 0.0 0.0 0.0 0.0
4 PVS 1 2019-12-24 20:20:02.540 2019-12-24 20:20:05.520 0 0.306369 0.018382 0.263422 0.367570 0.306916 0.163311 ... 0.000186 0.000305 0.0 0.000827 0.000357 0.0 0.0 0.0 0.0 0.0

5 rows × 169 columns

STEP 4: VISUALIZATION & EDA¶

Imports & paths¶

In [26]:
import pathlib, glob, json, os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Pretty plots
plt.rcParams["figure.figsize"] = (10, 4)
plt.rcParams["axes.grid"] = True

# === CHANGE THIS TO YOUR FOLDER ===
BASE_PATH = pathlib.Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset")
WORK     = BASE_PATH / "work"
CLEAN    = WORK / "clean_resampled"
WINDOWS  = WORK / "windows"

# Load config if present
CFG_PATH = WORK / "config.json"
CFG = json.loads(CFG_PATH.read_text()) if CFG_PATH.exists() else {
    "resample_hz": 50, "window_sec": 3.0, "overlap": 0.5
}

# Utility: safe column grab
def cols_exist(df, cols):
    return [c for c in cols if c in df.columns]

Inventory¶

In [27]:
proc_idx = pd.read_csv(WORK / "processed_sessions.csv")
win_sum  = pd.read_csv(WORK / "windows_summary.csv")

print("Cleaned sessions:", len(proc_idx))
display(proc_idx.head())

print("\nWindow files:", len(win_sum))
display(win_sum.head())

print("\nTotals:")
print("  Total cleaned rows:", f"{proc_idx['rows'].sum():,}")
print("  Total windows:", f"{win_sum['windows'].sum():,}")
print("  Mean positive rate:", round(win_sum['positive_rate'].mean(), 4))
Cleaned sessions: 9
session rows path
0 PVS 1 72019 C:\Users\sn161663\Desktop\Accident_Detection_P...
1 PVS 2 62343 C:\Users\sn161663\Desktop\Accident_Detection_P...
2 PVS 3 52908 C:\Users\sn161663\Desktop\Accident_Detection_P...
3 PVS 4 66246 C:\Users\sn161663\Desktop\Accident_Detection_P...
4 PVS 5 66939 C:\Users\sn161663\Desktop\Accident_Detection_P...
Window files: 9
session windows positive_rate path
0 PVS 1 959 0.0000 C:\Users\sn161663\Desktop\Accident_Detection_P...
1 PVS 2 830 0.0024 C:\Users\sn161663\Desktop\Accident_Detection_P...
2 PVS 3 704 0.0000 C:\Users\sn161663\Desktop\Accident_Detection_P...
3 PVS 4 882 0.0000 C:\Users\sn161663\Desktop\Accident_Detection_P...
4 PVS 5 891 0.0022 C:\Users\sn161663\Desktop\Accident_Detection_P...
Totals:
  Total cleaned rows: 540,458
  Total windows: 7,193
  Mean positive rate: 0.0016

Peek at one cleaned session¶

In [28]:
# Choose first session that exists
first_path = CLEAN / (proc_idx.loc[0, "session"] + ".parquet")
df = pd.read_parquet(first_path)

print("Shape:", df.shape)
print("\nColumns:", list(df.columns)[:30], "...\n")

print(df.info())
display(df.head(5))

# Basic stats for commonly-used columns (only those that exist)
want = ["acc_ax","acc_ay","acc_az","gyro_gx","gyro_gy","gyro_gz",
        "gps_speed","acc_mag","gyro_mag","speed_drop","proxy_incident"]
have = cols_exist(df, want)
if have:
    display(df[have].describe().T)
else:
    print("No standard numeric columns found in this session.")
Shape: (72019, 34)

Columns: ['gps_acc_x_dashboard', 'gps_acc_y_dashboard', 'gps_acc_z_dashboard', 'gps_acc_x_above_suspension', 'gps_acc_y_above_suspension', 'gps_acc_z_above_suspension', 'gps_acc_x_below_suspension', 'gps_acc_y_below_suspension', 'gps_acc_z_below_suspension', 'gps_gyro_x_dashboard', 'gps_gyro_y_dashboard', 'gps_gyro_z_dashboard', 'gps_gyro_x_above_suspension', 'gps_gyro_y_above_suspension', 'gps_gyro_z_above_suspension', 'gps_gyro_x_below_suspension', 'gps_gyro_y_below_suspension', 'gps_gyro_z_below_suspension', 'gps_mag_x_dashboard', 'gps_mag_y_dashboard', 'gps_mag_z_dashboard', 'gps_mag_x_above_suspension', 'gps_mag_y_above_suspension', 'gps_mag_z_above_suspension', 'gps_temp_dashboard', 'gps_temp_above_suspension', 'gps_temp_below_suspension', 'gps_timestamp_gps', 'gps_latitude', 'gps_longitude'] ...

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 72019 entries, 2019-12-24 20:19:56.540000 to 2019-12-24 20:43:56.900000
Data columns (total 34 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   gps_acc_x_dashboard          72019 non-null  float64
 1   gps_acc_y_dashboard          72019 non-null  float64
 2   gps_acc_z_dashboard          72019 non-null  float64
 3   gps_acc_x_above_suspension   72019 non-null  float64
 4   gps_acc_y_above_suspension   72019 non-null  float64
 5   gps_acc_z_above_suspension   72019 non-null  float64
 6   gps_acc_x_below_suspension   72019 non-null  float64
 7   gps_acc_y_below_suspension   72019 non-null  float64
 8   gps_acc_z_below_suspension   72019 non-null  float64
 9   gps_gyro_x_dashboard         72019 non-null  float64
 10  gps_gyro_y_dashboard         72019 non-null  float64
 11  gps_gyro_z_dashboard         72019 non-null  float64
 12  gps_gyro_x_above_suspension  72019 non-null  float64
 13  gps_gyro_y_above_suspension  72019 non-null  float64
 14  gps_gyro_z_above_suspension  72019 non-null  float64
 15  gps_gyro_x_below_suspension  72019 non-null  float64
 16  gps_gyro_y_below_suspension  72019 non-null  float64
 17  gps_gyro_z_below_suspension  72019 non-null  float64
 18  gps_mag_x_dashboard          72019 non-null  float64
 19  gps_mag_y_dashboard          72019 non-null  float64
 20  gps_mag_z_dashboard          72019 non-null  float64
 21  gps_mag_x_above_suspension   72019 non-null  float64
 22  gps_mag_y_above_suspension   72019 non-null  float64
 23  gps_mag_z_above_suspension   72019 non-null  float64
 24  gps_temp_dashboard           72019 non-null  float64
 25  gps_temp_above_suspension    72019 non-null  float64
 26  gps_temp_below_suspension    72019 non-null  float64
 27  gps_timestamp_gps            72019 non-null  float64
 28  gps_latitude                 72019 non-null  float64
 29  gps_longitude                72019 non-null  float64
 30  gps_speed                    72019 non-null  float64
 31  session                      72019 non-null  object 
 32  speed_drop                   72019 non-null  float64
 33  proxy_incident               72019 non-null  int64  
dtypes: float64(32), int64(1), object(1)
memory usage: 19.2+ MB
None
gps_acc_x_dashboard gps_acc_y_dashboard gps_acc_z_dashboard gps_acc_x_above_suspension gps_acc_y_above_suspension gps_acc_z_above_suspension gps_acc_x_below_suspension gps_acc_y_below_suspension gps_acc_z_below_suspension gps_gyro_x_dashboard ... gps_temp_dashboard gps_temp_above_suspension gps_temp_below_suspension gps_timestamp_gps gps_latitude gps_longitude gps_speed session speed_drop proxy_incident
timestamp
2019-12-24 20:19:56.540 0.314897 0.187227 9.863572 0.314750 0.166426 9.808869 0.529819 0.097111 9.930623 0.221062 ... 34.274628 34.035014 31.926408 1.577219e+09 -27.717841 -51.098865 0.009128 PVS 1 0.0 0
2019-12-24 20:19:56.560 0.297539 0.187227 9.869558 0.313553 0.154455 9.855556 0.525031 0.100702 9.948579 0.045586 ... 34.358493 34.082936 31.734717 1.577219e+09 -27.717841 -51.098865 0.009128 PVS 1 0.0 0
2019-12-24 20:19:56.580 0.308912 0.199198 9.842024 0.332706 0.159244 9.831614 0.533411 0.092323 9.887527 0.175285 ... 34.370474 33.939168 31.447180 1.577219e+09 -27.717841 -51.098865 0.009128 PVS 1 0.0 0
2019-12-24 20:19:56.600 0.317292 0.157299 9.859981 0.297991 0.156849 9.824431 0.498695 0.404766 9.923440 0.205803 ... 34.514242 33.963129 31.447180 1.577219e+09 -27.717841 -51.098865 0.009128 PVS 1 0.0 0
2019-12-24 20:19:56.620 0.295744 0.148919 9.885120 0.266866 0.156849 9.835205 0.496300 0.097111 9.856403 -0.206184 ... 34.370474 34.082936 31.638871 1.577219e+09 -27.717841 -51.098865 0.009128 PVS 1 0.0 0

5 rows × 34 columns

count mean std min 25% 50% 75% max
gps_speed 72019.0 9.556739 7.746441 0.002526 4.508887 6.618945 16.647470 26.874480
speed_drop 72019.0 0.159041 0.285608 0.000000 0.000000 0.004228 0.225994 3.895296
proxy_incident 72019.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

Missing values profile (bar chart)¶

In [29]:
na_pct = df.isna().mean().sort_values(ascending=False) * 100
na_top = na_pct[na_pct > 0].head(25)

if not na_top.empty:
    plt.figure(figsize=(10, 6))
    na_top.sort_values().plot(kind="barh")
    plt.title("Top columns with missing values (%)")
    plt.xlabel("% missing")
    plt.tight_layout(); plt.show()
else:
    print("No missing values in this session.")
No missing values in this session.

Distributions¶

In [30]:
# Pick some numeric columns that actually exist
num_candidates = cols_exist(df, ["acc_ax","acc_ay","acc_az",
                                "gyro_gx","gyro_gy","gyro_gz",
                                "gps_speed","acc_mag","gyro_mag","speed_drop"])

for c in num_candidates[:6]:
    plt.figure()
    df[c].dropna().hist(bins=60)
    plt.title(f"Distribution: {c}")
    plt.xlabel(c); plt.ylabel("Frequency")
    plt.show()

# Proxy label distribution
if "proxy_incident" in df.columns:
    vc = df["proxy_incident"].value_counts()
    print("Proxy incident counts:\n", vc)
    vc.plot(kind="bar")
    plt.title("Proxy incident distribution (per timestamp)")
    plt.xticks(rotation=0); plt.show()
No description has been provided for this image
No description has been provided for this image
Proxy incident counts:
 proxy_incident
0    72019
Name: count, dtype: int64
No description has been provided for this image

Short time-series preview¶

In [31]:
fs = CFG["resample_hz"]
seg_samples = 60 * fs
seg = df.iloc[:seg_samples].copy()

has_acc  = cols_exist(seg, ["acc_ax","acc_ay","acc_az"])
has_gyro = cols_exist(seg, ["gyro_gx","gyro_gy","gyro_gz"])
has_speed= cols_exist(seg, ["gps_speed"])

# Plot accelerometer
if has_acc:
    seg[has_acc].plot()
    plt.title("Accelerometer (first 60s)")
    plt.show()

# Plot gyroscope
if has_gyro:
    seg[has_gyro].plot()
    plt.title("Gyroscope (first 60s)")
    plt.show()

# Plot speed & proxy label markers
if has_speed:
    ax = seg["gps_speed"].plot(label="gps_speed")
    if "proxy_incident" in seg.columns and seg["proxy_incident"].sum() > 0:
        where = seg["proxy_incident"] == 1
        ax.scatter(seg.index[where], seg.loc[where,"gps_speed"],
                   marker="x", s=30, label="proxy_incident")
    ax.set_title("GPS speed (first 60s) + proxy events")
    ax.legend(); plt.show()
No description has been provided for this image

Correlation heatmap¶

In [32]:
num_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
# Limit to a manageable number for plotting (top 25 by variance)
if len(num_cols) > 25:
    var = df[num_cols].var().sort_values(ascending=False)
    num_cols = var.head(25).index.tolist()

corr = df[num_cols].corr()
plt.figure(figsize=(8, 6))
plt.imshow(corr, cmap="coolwarm", vmin=-1, vmax=1)
plt.title("Correlation (numeric subset)")
plt.colorbar(shrink=0.8)
plt.xticks(range(len(num_cols)), num_cols, rotation=90)
plt.yticks(range(len(num_cols)), num_cols)
plt.tight_layout(); plt.show()
No description has been provided for this image

Session-level label rate¶

In [33]:
win_sum = pd.read_csv(WORK / "windows_summary.csv")
win_sum = win_sum.sort_values("positive_rate", ascending=False)
ax = win_sum.plot(x="session", y="positive_rate", kind="bar", legend=False)
ax.set_title("Positive window rate by session")
ax.set_ylabel("Positive rate"); ax.set_xlabel("Session")
plt.xticks(rotation=45, ha="right"); plt.tight_layout(); plt.show()

print("Overall positive rate:",
      round((win_sum["positive_rate"]*win_sum["windows"]).sum() / win_sum["windows"].sum(), 4))
No description has been provided for this image
Overall positive rate: 0.0017

Windows feature exploration (y=0 vs y=1)¶

In [34]:
# Read a sample of window files (to avoid huge memory)
win_files = sorted(glob.glob(str(WINDOWS / "*_windows.parquet")))
frames = []
for p in win_files[:6]:  # adjust if you want more
    f = pd.read_parquet(p)
    if len(f) > 6000:
        f = f.sample(6000, random_state=42)
    frames.append(f)
W = pd.concat(frames, ignore_index=True)
print("Windows sample shape:", W.shape)
print(W["y"].value_counts())

# Choose informative features (keep it small)
cands = [c for c in W.columns if any(k in c for k in ["acc_mag","gyro_mag","gps_speed","speed_drop"]) and c.endswith(("_mean","_max","_std","_rms"))]
top = cands[:8] if len(cands) > 8 else cands
print("Using features:", top)

# Boxplots: y=0 vs y=1 for each chosen feature
import math
r = math.ceil(len(top)/2)
fig, axes = plt.subplots(r, 2, figsize=(12, 4*r))
axes = axes.flatten()
for i, c in enumerate(top):
    try:
        W.boxplot(column=c, by="y", ax=axes[i])
        axes[i].set_title(c); axes[i].set_xlabel("y"); axes[i].set_ylabel(c)
    except Exception as e:
        axes[i].set_visible(False)
plt.suptitle("Feature distributions by class (windows)"); plt.tight_layout(); plt.show()
Windows sample shape: (4906, 169)
y
0    4900
1       6
Name: count, dtype: int64
Using features: ['gps_speed_mean', 'gps_speed_std', 'gps_speed_max', 'gps_speed_rms', 'speed_drop_mean', 'speed_drop_std', 'speed_drop_max', 'speed_drop_rms']
No description has been provided for this image

Outliner scan (z-score count)¶

In [35]:
def z_outliers(x, z=4.0):
    m, s = np.nanmean(x), np.nanstd(x)
    if s == 0 or np.isnan(s): return 0
    return int(np.sum(np.abs((x - m)/s) > z))

if num_cols:
    outlier_counts = {c: z_outliers(df[c].values) for c in num_cols}
    outlier_df = (pd.Series(outlier_counts)
                    .sort_values(ascending=False)
                    .head(15)
                    .rename("outliers (>4σ)"))
    display(outlier_df)
else:
    print("No numeric columns for outlier scan.")
gps_gyro_z_dashboard           1284
gps_gyro_z_above_suspension    1261
gps_acc_y_below_suspension      408
gps_acc_z_below_suspension      368
gps_gyro_z_below_suspension     354
gps_gyro_x_above_suspension     341
gps_gyro_x_below_suspension     294
gps_acc_x_below_suspension      279
gps_gyro_y_above_suspension     257
gps_gyro_x_dashboard            256
gps_gyro_y_dashboard            252
gps_gyro_y_below_suspension     227
gps_acc_y_dashboard             217
gps_acc_z_dashboard             209
gps_acc_z_above_suspension      178
Name: outliers (>4σ), dtype: int64

Save a compact EDA report¶

In [36]:
REPORTS = WORK / "eda_reports"
REPORTS.mkdir(exist_ok=True)

# 1) Column summary (cleaned session)
col_summary = pd.DataFrame({
    "dtype": df.dtypes.astype(str),
    "missing_pct": df.isna().mean()*100,
    "nonzero_pct": (df.fillna(0) != 0).mean()*100,
})
col_summary.to_csv(REPORTS / "cleaned_column_summary.csv")

# 2) Windows class summary
win_class = W["y"].value_counts(normalize=True).rename_axis("y").rename("pct").reset_index()
win_class.to_csv(REPORTS / "windows_class_distribution.csv", index=False)

# 3) Per-session positive rate (already exists but replicate)
win_sum.to_csv(REPORTS / "per_session_positive_rate.csv", index=False)

print("Saved EDA CSVs to:", REPORTS)
Saved EDA CSVs to: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\eda_reports

EDA-Map: Quick GPS scapper map¶

In [37]:
try:
    import folium
    gps_cols = cols_exist(df, ["gps_latitude","gps_longitude"]) or cols_exist(df, ["latitude","longitude"])
    if gps_cols:
        lat_col, lon_col = gps_cols[0], gps_cols[1]
        gps = df[[lat_col, lon_col]].dropna().sample(min(3000, len(df)), random_state=42)
        center = [gps[lat_col].mean(), gps[lon_col].mean()]
        m = folium.Map(location=center, zoom_start=13)
        for _, r in gps.iterrows():
            folium.CircleMarker([r[lat_col], r[lon_col]], radius=1).add_to(m)
        display(m)
    else:
        print("No GPS columns found for mapping.")
except Exception as e:
    print("Map skipped:", e)
Make this Notebook Trusted to load map: File -> Trust Notebook

STEP 5: MODELING¶

Setup & imports¶

In [39]:
# Core
import pathlib, glob, json, os, gc, warnings
import numpy as np
import pandas as pd
warnings.filterwarnings("ignore")

# ML
from sklearn.model_selection import train_test_split, GroupKFold, StratifiedGroupKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (accuracy_score, precision_recall_fscore_support,
                             roc_auc_score, average_precision_score,
                             confusion_matrix, classification_report)
from sklearn.utils.class_weight import compute_class_weight

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

# Plotting
import matplotlib.pyplot as plt

# === CHANGE THIS TO YOUR PROJECT ROOT ===
BASE_PATH = pathlib.Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset")
WORK     = BASE_PATH / "work"
WINDOWS  = WORK / "windows"
MODEL_DIR= WORK / "models"
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Load config (for reproducibility notes)
CFG = json.loads((WORK/"config.json").read_text()) if (WORK/"config.json").exists() else {}
print("Models will be saved to:", MODEL_DIR)
Models will be saved to: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\models

Load & merge window datasets¶

In [40]:
win_files = sorted(glob.glob(str(WINDOWS / "*_windows.parquet")))
assert win_files, "No window files found. Run Step 3 windowing first."

frames = []
for p in win_files:
    f = pd.read_parquet(p)
    frames.append(f)
W = pd.concat(frames, ignore_index=True)
print("Windows shape:", W.shape)
print("Class balance:", W["y"].value_counts(normalize=True).rename({0:"normal",1:"incident"}))
print("Sessions:", W["session"].nunique())
display(W.head(3))
Windows shape: (7193, 169)
Class balance: y
normal      0.998332
incident    0.001668
Name: proportion, dtype: float64
Sessions: 9
session t_start t_end y gps_acc_x_dashboard_mean gps_acc_x_dashboard_std gps_acc_x_dashboard_min gps_acc_x_dashboard_max gps_acc_x_dashboard_rms gps_acc_y_dashboard_mean ... speed_drop_mean speed_drop_std speed_drop_min speed_drop_max speed_drop_rms proxy_incident_mean proxy_incident_std proxy_incident_min proxy_incident_max proxy_incident_rms
0 PVS 1 2019-12-24 20:19:56.540 2019-12-24 20:19:59.520 0 0.306168 0.016296 0.270605 0.365176 0.306598 0.165220 ... 0.000099 0.000143 0.0 0.000303 0.000173 0.0 0.0 0.0 0.0 0.0
1 PVS 1 2019-12-24 20:19:58.040 2019-12-24 20:20:01.020 0 0.307486 0.026242 0.223918 0.374752 0.308596 0.163978 ... 0.000201 0.000913 0.0 0.007933 0.000932 0.0 0.0 0.0 0.0 0.0
2 PVS 1 2019-12-24 20:19:59.540 2019-12-24 20:20:02.520 0 0.306282 0.030071 0.195187 0.385526 0.307745 0.163780 ... 0.002592 0.003733 0.0 0.007933 0.004534 0.0 0.0 0.0 0.0 0.0

3 rows × 169 columns

Split Train /Validation/ Test by session¶

In [41]:
rng = np.random.RandomState(42)

# Unique sessions
sessions = W["session"].unique()
rng.shuffle(sessions)

# 70/15/15 split by session count
n = len(sessions)
n_train = int(0.70*n)
n_valid = int(0.15*n)
train_s = set(sessions[:n_train])
valid_s = set(sessions[n_train:n_train+n_valid])
test_s  = set(sessions[n_train+n_valid:])

def mask(sset): return W["session"].isin(sset)

train_df = W[mask(train_s)].reset_index(drop=True)
valid_df = W[mask(valid_s)].reset_index(drop=True)
test_df  = W[mask(test_s)].reset_index(drop=True)

print("Sessions -> train/valid/test:", len(train_s), len(valid_s), len(test_s))
print("Rows    ->", len(train_df), len(valid_df), len(test_df))

# Build feature list (numeric only, drop obvious non-features)
drop_cols = {"session","t_start","t_end","y"}
feat_cols = [c for c in W.columns
             if c not in drop_cols and pd.api.types.is_numeric_dtype(W[c])]

X_train, y_train, g_train = train_df[feat_cols], train_df["y"], train_df["session"]
X_valid, y_valid, g_valid = valid_df[feat_cols], valid_df["y"], valid_df["session"]
X_test,  y_test,  g_test  = test_df [feat_cols], test_df ["y"], test_df ["session"]

print("Feature count:", len(feat_cols))
Sessions -> train/valid/test: 6 1 2
Rows    -> 4565 891 1737
Feature count: 165

Class weights & evaluation helper¶

In [42]:
# Class weights from training set
classes = np.array([0,1])
cw = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
class_weight = {int(k): float(v) for k, v in zip(classes, cw)}
print("Class weights:", class_weight)

def evaluate(model, X, y, title=""):
    proba_ok = hasattr(model, "predict_proba")
    if proba_ok:
        y_proba = model.predict_proba(X)[:,1]
    else:
        # decision_function fallback (SVM)
        if hasattr(model, "decision_function"):
            s = model.decision_function(X)
            y_proba = (s - s.min()) / (s.max()-s.min() + 1e-9)
        else:
            y_proba = None

    y_hat = model.predict(X)
    acc = accuracy_score(y, y_hat)
    p, r, f1, _ = precision_recall_fscore_support(y, y_hat, average="binary", zero_division=0)
    ap = average_precision_score(y, y_proba) if y_proba is not None else np.nan
    auc = roc_auc_score(y, y_proba) if y_proba is not None else np.nan

    print(f"\n== {title} ==")
    print(f"Accuracy: {acc:.4f} | Precision: {p:.4f} | Recall: {r:.4f} | F1: {f1:.4f}")
    print(f"ROC-AUC: {auc:.4f} | PR-AUC (AP): {ap:.4f}")

    # Confusion matrix
    cm = confusion_matrix(y, y_hat, labels=[0,1])
    fig, ax = plt.subplots(figsize=(3.5,3))
    im = ax.imshow(cm, cmap="Blues")
    ax.set_title(f"Confusion Matrix: {title}")
    ax.set_xticks([0,1]); ax.set_yticks([0,1])
    ax.set_xticklabels(["0","1"]); ax.set_yticklabels(["0","1"])
    for (i,j), v in np.ndenumerate(cm):
        ax.text(j, i, str(v), ha="center", va="center", color="black")
    ax.set_xlabel("Predicted"); ax.set_ylabel("True")
    plt.colorbar(im, fraction=0.046, pad=0.04); plt.tight_layout(); plt.show()

    return {"acc":acc, "prec":p, "rec":r, "f1":f1, "auc":auc, "ap":ap}
Class weights: {0: 0.5007678806494076, 1: 326.07142857142856}

Preprocessing pipeline¶

In [43]:
base_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler",  StandardScaler(with_mean=True, with_std=True))
]

Train baseline models¶

In [44]:
results = {}

# 1) Logistic Regression (liblinear for small/imbalanced)
lr = Pipeline(base_steps + [
    ("clf", LogisticRegression(max_iter=500, class_weight=class_weight, solver="liblinear"))
])
lr.fit(X_train, y_train)
results["LogReg_valid"] = evaluate(lr, X_valid, y_valid, "LogReg (valid)")

# 2) Random Forest
rf = Pipeline(base_steps + [
    ("clf", RandomForestClassifier(n_estimators=300, max_depth=None,
                                  class_weight=class_weight, random_state=42, n_jobs=-1))
])
rf.fit(X_train, y_train)
results["RF_valid"] = evaluate(rf, X_valid, y_valid, "RandomForest (valid)")

# 3) Gradient Boosting
gb = Pipeline(base_steps + [
    ("clf", GradientBoostingClassifier(random_state=42))
])
gb.fit(X_train, y_train)
results["GB_valid"] = evaluate(gb, X_valid, y_valid, "GradientBoosting (valid)")

# 4) SVM (RBF)
svm = Pipeline(base_steps + [
    ("clf", SVC(kernel="rbf", C=1.0, gamma="scale", class_weight=class_weight, probability=True, random_state=42))
])
svm.fit(X_train, y_train)
results["SVM_valid"] = evaluate(svm, X_valid, y_valid, "SVM RBF (valid)")

pd.DataFrame(results).T
== LogReg (valid) ==
Accuracy: 1.0000 | Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000
ROC-AUC: 1.0000 | PR-AUC (AP): 1.0000
No description has been provided for this image
== RandomForest (valid) ==
Accuracy: 1.0000 | Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000
ROC-AUC: 1.0000 | PR-AUC (AP): 1.0000
No description has been provided for this image
== GradientBoosting (valid) ==
Accuracy: 1.0000 | Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000
ROC-AUC: 1.0000 | PR-AUC (AP): 1.0000
No description has been provided for this image
== SVM RBF (valid) ==
Accuracy: 1.0000 | Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000
ROC-AUC: 1.0000 | PR-AUC (AP): 1.0000
No description has been provided for this image
Out[44]:
acc prec rec f1 auc ap
LogReg_valid 1.0 1.0 1.0 1.0 1.0 1.0
RF_valid 1.0 1.0 1.0 1.0 1.0 1.0
GB_valid 1.0 1.0 1.0 1.0 1.0 1.0
SVM_valid 1.0 1.0 1.0 1.0 1.0 1.0

Light hyper-parameter tuning¶

In [45]:
cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

def gridsearch(pipe, grid, name):
    gs = GridSearchCV(pipe, grid, scoring="average_precision", cv=cv, n_jobs=-1, verbose=0)
    gs.fit(X_train, y_train, groups=g_train)
    print(f"\n[{name}] best AP (cv): {gs.best_score_:.4f}")
    print("Best params:", gs.best_params_)
    # Evaluate on validation
    best = gs.best_estimator_
    evaluate(best, X_valid, y_valid, f"{name} (VALID)")
    return best

# Logistic Regression grid
lr_grid = {
    "clf__C": [0.1, 1.0, 3.0],
    "clf__penalty": ["l1","l2"],
    "clf__solver": ["liblinear"]
}
lr_best = gridsearch(
    Pipeline(base_steps + [("clf", LogisticRegression(max_iter=800, class_weight=class_weight))]),
    lr_grid, "LogReg"
)

# Random Forest grid
rf_grid = {
    "clf__n_estimators": [300, 600],
    "clf__max_depth": [None, 12, 18],
    "clf__min_samples_leaf": [1, 3]
}
rf_best = gridsearch(
    Pipeline(base_steps + [("clf", RandomForestClassifier(class_weight=class_weight, random_state=42, n_jobs=-1))]),
    rf_grid, "RandomForest"
)

# Gradient Boosting grid
gb_grid = {
    "clf__n_estimators": [150, 300],
    "clf__learning_rate": [0.05, 0.1],
    "clf__max_depth": [2, 3]
}
gb_best = gridsearch(
    Pipeline(base_steps + [("clf", GradientBoostingClassifier(random_state=42))]),
    gb_grid, "GradientBoosting"
)

# SVM grid (RBF)
svm_grid = {
    "clf__C": [0.5, 1.0, 2.0],
    "clf__gamma": ["scale", 0.1, 0.01]
}
svm_best = gridsearch(
    Pipeline(base_steps + [("clf", SVC(kernel="rbf", class_weight=class_weight, probability=True, random_state=42))]),
    svm_grid, "SVM RBF"
)
[LogReg] best AP (cv): 0.6000
Best params: {'clf__C': 0.1, 'clf__penalty': 'l1', 'clf__solver': 'liblinear'}

== LogReg (VALID) ==
Accuracy: 1.0000 | Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000
ROC-AUC: 1.0000 | PR-AUC (AP): 1.0000
No description has been provided for this image
[RandomForest] best AP (cv): 0.6000
Best params: {'clf__max_depth': None, 'clf__min_samples_leaf': 1, 'clf__n_estimators': 300}

== RandomForest (VALID) ==
Accuracy: 1.0000 | Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000
ROC-AUC: 1.0000 | PR-AUC (AP): 1.0000
No description has been provided for this image
[GradientBoosting] best AP (cv): 0.6000
Best params: {'clf__learning_rate': 0.05, 'clf__max_depth': 2, 'clf__n_estimators': 150}

== GradientBoosting (VALID) ==
Accuracy: 1.0000 | Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000
ROC-AUC: 1.0000 | PR-AUC (AP): 1.0000
No description has been provided for this image
[SVM RBF] best AP (cv): 0.6000
Best params: {'clf__C': 0.5, 'clf__gamma': 'scale'}

== SVM RBF (VALID) ==
Accuracy: 1.0000 | Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000
ROC-AUC: 1.0000 | PR-AUC (AP): 1.0000
No description has been provided for this image

Train + Valid¶

In [46]:
# Manually pick based on your VALID metrics (edit this line)
best_model = rf_best   # e.g., rf_best / gb_best / lr_best / svm_best

# Refit on Train+Valid
X_trv = pd.concat([X_train, X_valid], axis=0)
y_trv = pd.concat([y_train, y_valid], axis=0)
best_model.fit(X_trv, y_trv)

# Final test evaluation
test_metrics = evaluate(best_model, X_test, y_test, "FINAL (TEST)")
test_metrics
== FINAL (TEST) ==
Accuracy: 0.9983 | Precision: 0.0000 | Recall: 0.0000 | F1: 0.0000
ROC-AUC: 1.0000 | PR-AUC (AP): 1.0000
No description has been provided for this image
Out[46]:
{'acc': 0.998272884283247,
 'prec': 0.0,
 'rec': 0.0,
 'f1': 0.0,
 'auc': np.float64(1.0),
 'ap': np.float64(1.0)}

Save model & metadata¶

In [47]:
import joblib, time
stamp = time.strftime("%Y%m%d_%H%M%S")
model_path = MODEL_DIR / f"best_model_{stamp}.joblib"
joblib.dump(best_model, model_path)

meta = {
    "created": stamp,
    "features": feat_cols,
    "config": CFG,
    "train_sessions": sorted(list(set(g_train))),
    "valid_sessions": sorted(list(set(g_valid))),
    "test_sessions":  sorted(list(set(g_test))),
    "test_metrics": test_metrics,
    "class_weight": class_weight
}
json.dump(meta, open(MODEL_DIR / f"best_model_{stamp}.json", "w"), indent=2)
print("Saved:", model_path)
Saved: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\models\best_model_20251110_180118.joblib

To Show how to load and predict on a new batch of windows¶

In [48]:
# Load
best_model = joblib.load(model_path)

# Example: predict on test set (or any new window table with same features)
proba = best_model.predict_proba(X_test)[:,1]
yhat  = (proba >= 0.5).astype(int)

print("Sample predictions:", yhat[:10])
print("Sample probabilities:", np.round(proba[:10], 3))
Sample predictions: [0 0 0 0 0 0 0 0 0 0]
Sample probabilities: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]

STEP 6: EVALUATION¶

Reloding the model and testing the split¶

In [49]:
import pathlib, json, joblib, pandas as pd
from sklearn.metrics import confusion_matrix

# === CHANGE to your project root if needed ===
BASE_PATH = pathlib.Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset")
WORK      = BASE_PATH / "work"
MODEL_DIR = WORK / "models"
WINDOWS   = WORK / "windows"

# Load the latest saved model & metadata
model_path = sorted(MODEL_DIR.glob("best_model_*.joblib"))[-1]
meta_path  = model_path.with_suffix(".json")
best_model = joblib.load(model_path)
meta       = json.load(open(meta_path))

feat_cols     = meta["features"]
test_sessions = meta["test_sessions"]

# Rebuild test set from window files
import glob
frames = []
for p in sorted(glob.glob(str(WINDOWS/"*_windows.parquet"))):
    df = pd.read_parquet(p)
    if df["session"].iloc[0] in test_sessions:
        frames.append(df)
test_df = pd.concat(frames, ignore_index=True)

X_test  = test_df[feat_cols]
y_test  = test_df["y"]
g_test  = test_df["session"]
print("Loaded model:", model_path.name, "| Test rows:", len(test_df))
Loaded model: best_model_20251110_180118.joblib | Test rows: 1737

For computing metrics + confusion matrix¶

In [50]:
import numpy as np, matplotlib.pyplot as plt
from sklearn.metrics import (accuracy_score, precision_recall_fscore_support,
                             roc_auc_score, average_precision_score,
                             ConfusionMatrixDisplay)

def evaluate(model, X, y, title=""):
    proba = (model.predict_proba(X)[:,1]
             if hasattr(model, "predict_proba")
             else model.decision_function(X))
    # Ensure [0,1]
    if proba.min() < 0 or proba.max() > 1:
        proba = (proba - proba.min())/(proba.max()-proba.min() + 1e-9)

    yhat = (proba >= 0.5).astype(int)
    acc = accuracy_score(y, yhat)
    p, r, f1, _ = precision_recall_fscore_support(y, yhat, average="binary", zero_division=0)
    auc = roc_auc_score(y, proba)
    ap  = average_precision_score(y, proba)

    print(f"\n== {title} ==")
    print(f"Accuracy : {acc:.4f}")
    print(f"Precision: {p:.4f} | Recall: {r:.4f} | F1: {f1:.4f}")
    print(f"ROC-AUC  : {auc:.4f} | PR-AUC (AP): {ap:.4f}")

    ConfusionMatrixDisplay.from_predictions(y, yhat, labels=[0,1], cmap="Blues")
    plt.title(f"Confusion Matrix – {title} (thr=0.50)")
    plt.tight_layout(); plt.show()

    return dict(acc=acc, prec=p, rec=r, f1=f1, auc=auc, ap=ap, proba=proba, yhat=yhat)

Global metrics on Valid and Test¶

In [51]:
metrics_test = evaluate(best_model, X_test, y_test, "TEST")
== TEST ==
Accuracy : 0.9983
Precision: 0.0000 | Recall: 0.0000 | F1: 0.0000
ROC-AUC  : 1.0000 | PR-AUC (AP): 1.0000
No description has been provided for this image

curves (ROC & Precision-Recall) on TEST¶

In [52]:
from sklearn.metrics import roc_curve, precision_recall_curve, auc

proba = metrics_test["proba"]
fpr, tpr, _ = roc_curve(y_test, proba)
prec, rec, _ = precision_recall_curve(y_test, proba)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(5,4))
plt.plot(fpr, tpr, lw=2)
plt.plot([0,1],[0,1],'--',lw=1, color='grey')
plt.xlabel("False Positive Rate"); plt.ylabel("True Positive Rate")
plt.title(f"ROC Curve (AUC={roc_auc:.3f}) – TEST"); plt.grid(True); plt.tight_layout(); plt.show()

plt.figure(figsize=(5,4))
plt.plot(rec, prec, lw=2)
plt.xlabel("Recall"); plt.ylabel("Precision")
plt.title(f"Precision–Recall (AP={metrics_test['ap']:.3f}) – TEST")
plt.grid(True); plt.tight_layout(); plt.show()
No description has been provided for this image
No description has been provided for this image

Threshold tuning (optimize F1 or Recall>=X)¶

In [53]:
def find_best_threshold(y, proba, min_recall=None):
    prec, rec, thr = precision_recall_curve(y, proba)
    # Skip last duplicate threshold
    thr = np.r_[thr, 1.0]
    f1 = 2*prec*rec/(prec+rec+1e-9)
    if min_recall is not None:
        ok = rec >= min_recall
        if ok.any():
            i = np.argmax(f1 * ok)  # best F1 among those meeting recall
        else:
            i = np.argmax(f1)      # fallback
    else:
        i = np.argmax(f1)
    return float(thr[i]), float(prec[i]), float(rec[i]), float(f1[i])

best_thr, p_at, r_at, f1_at = find_best_threshold(y_test, proba, min_recall=0.85)  # change or set to None
print(f"Chosen threshold: {best_thr:.3f} | Prec={p_at:.3f} Recall={r_at:.3f} F1={f1_at:.3f}")

yhat_tuned = (proba >= best_thr).astype(int)
ConfusionMatrixDisplay.from_predictions(y_test, yhat_tuned, labels=[0,1], cmap="Blues")
plt.title(f"Confusion Matrix – TEST (thr={best_thr:.2f})"); plt.tight_layout(); plt.show()
Chosen threshold: 0.143 | Prec=1.000 Recall=1.000 F1=1.000
No description has been provided for this image

Per-session metrics (reliability by trip)¶

In [54]:
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

df_eval = pd.DataFrame({"session": g_test, "y": y_test, "proba": proba})
df_eval["yhat"] = (df_eval["proba"] >= best_thr).astype(int)

rows = []
for s, grp in df_eval.groupby("session"):
    p, r, f1, _ = precision_recall_fscore_support(grp["y"], grp["yhat"], average="binary", zero_division=0)
    acc = accuracy_score(grp["y"], grp["yhat"])
    rows.append({"session": s, "n": len(grp), "acc":acc, "prec":p, "rec":r, "f1":f1})
per_session = pd.DataFrame(rows).sort_values("f1", ascending=False)
display(per_session.head(10))

ax = per_session.plot(x="session", y=["f1","rec","prec"], kind="bar", figsize=(10,4))
ax.set_title("Per-session metrics (TEST)"); ax.set_ylabel("Score"); plt.xticks(rotation=45, ha="right")
plt.tight_layout(); plt.show()
session n acc prec rec f1
1 PVS 7 855 1.0 1.0 1.0 1.0
0 PVS 4 882 1.0 0.0 0.0 0.0
No description has been provided for this image

Calibration check (reliability curve & brier score)¶

In [55]:
from sklearn.calibration import calibration_curve
from sklearn.metrics import brier_score_loss

prob_true, prob_pred = calibration_curve(y_test, proba, n_bins=10, strategy="quantile")

plt.figure(figsize=(5,4))
plt.plot(prob_pred, prob_true, marker='o')
plt.plot([0,1],[0,1],'--', color='grey')
plt.xlabel("Predicted probability"); plt.ylabel("Observed frequency")
plt.title("Reliability curve – TEST"); plt.grid(True); plt.tight_layout(); plt.show()

print("Brier score (lower is better):", round(brier_score_loss(y_test, proba), 4))
No description has been provided for this image
Brier score (lower is better): 0.0007

Feature Importance¶

In [56]:
import numpy as np
import matplotlib.pyplot as plt

def plot_importance(model, feature_names, top_k=20, title="Feature importance"):
    if hasattr(model.named_steps["clf"], "feature_importances_"):
        imp = model.named_steps["clf"].feature_importances_
        names = np.array(feature_names)
        idx = np.argsort(imp)[::-1][:top_k]
        plt.figure(figsize=(8,6))
        plt.barh(range(len(idx)), imp[idx][::-1])
        plt.yticks(range(len(idx)), names[idx][::-1], fontsize=9)
        plt.title(title); plt.tight_layout(); plt.show()
        return pd.DataFrame({"feature": names[idx], "importance": imp[idx]})
    else:
        print("Tree-based importances not available; consider permutation importance below.")

imp_df = plot_importance(best_model, feat_cols, top_k=20, title="Top features (tree model)")
No description has been provided for this image
In [58]:
from sklearn.inspection import permutation_importance
pi = permutation_importance(best_model, X_test, y_test, n_repeats=5, random_state=42, scoring="average_precision")
idx = np.argsort(pi.importances_mean)[::-1][:20]
plt.figure(figsize=(8,6))
plt.barh(range(len(idx)), pi.importances_mean[idx][::-1])
plt.yticks(range(len(idx)), np.array(feat_cols)[idx][::-1], fontsize=9)
plt.title("Permutation importance – TEST"); plt.tight_layout(); plt.show()
No description has been provided for this image

Error Analysis - list top FNs & FPs¶

In [59]:
errs = pd.DataFrame({"session": g_test, "y": y_test, "proba": proba})
errs["yhat_thr"] = (errs["proba"] >= best_thr).astype(int)
fn = errs[(errs["y"]==1) & (errs["yhat_thr"]==0)].sort_values("proba")      # missed incidents
fp = errs[(errs["y"]==0) & (errs["yhat_thr"]==1)].sort_values("proba", ascending=False)  # false alarms

print("Top 10 False Negatives (worst misses):")
display(fn.head(10))
print("Top 10 False Positives (high-confidence false alarms):")
display(fp.head(10))

# Save for review
REPORT_DIR = WORK / "eval_reports"; REPORT_DIR.mkdir(exist_ok=True)
fn.head(200).to_csv(REPORT_DIR/"false_negatives.csv", index=False)
fp.head(200).to_csv(REPORT_DIR/"false_positives.csv", index=False)
print("Saved:", REPORT_DIR)
Top 10 False Negatives (worst misses):
session y proba yhat_thr
Top 10 False Positives (high-confidence false alarms):
session y proba yhat_thr
Saved: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\eval_reports

Export a compact evaluation report (CSV + JSON)¶

In [60]:
import json, time, pandas as pd, numpy as np

stamp = time.strftime("%Y%m%d_%H%M%S")
summary = {
    "timestamp": stamp,
    "model_file": model_path.name if "model_path" in globals() else "in-memory",
    "thr": best_thr,
    "test_metrics_default_thr": {k: float(v) for k,v in metrics_test.items() if k not in ("proba","yhat")},
    "test_metrics_tuned_thr": {
        "acc": float((errs["y"]==(errs["proba"]>=best_thr).astype(int)).mean()),
        "prec": float((fp.shape[0]==0 and 0.0) or (1 - fp.shape[0]/max(1, (errs["yhat_thr"]==1).sum()))),  # quick approx; formal calc below
    }
}

# Formal precision/recall/F1 at tuned threshold
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
yhat_tuned = (proba >= best_thr).astype(int)
p, r, f1, _ = precision_recall_fscore_support(y_test, yhat_tuned, average="binary", zero_division=0)
summary["test_metrics_tuned_thr"].update({
    "precision": float(p), "recall": float(r), "f1": float(f1)
})

# Save
report_dir = WORK/"eval_reports"; report_dir.mkdir(exist_ok=True)
json.dump(summary, open(report_dir/f"evaluation_{stamp}.json","w"), indent=2)

# Per-session table too
per_session.to_csv(report_dir/f"per_session_{stamp}.csv", index=False)

print("Saved reports to:", report_dir)
Saved reports to: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\eval_reports

Testing Purpose¶

Assignment 2 Testing¶

In [1]:
# Pick the numeric columns you want to report in Assignment 2
num_cols = ['gps_speed', 'speed_drop']  # add others you used, e.g., 'acc_x', 'acc_y'...

# Compute mean, median, mode, and midrange
summary = {}
for col in num_cols:
    s = df[col].dropna()
    mean_ = s.mean()
    median_ = s.median()
    # mode() can return multiple values; take the first if any
    mode_ = s.mode().iloc[0] if not s.mode().empty else float('nan')
    midrange_ = (s.max() + s.min()) / 2.0
    summary[col] = {
        'count': s.shape[0],
        'mean': mean_,
        'median': median_,
        'mode': mode_,
        'min': s.min(),
        'max': s.max(),
        'midrange': midrange_
    }

import pandas as pd
ct_df = pd.DataFrame(summary).T
ct_df
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[1], line 7
      5 summary = {}
      6 for col in num_cols:
----> 7     s = df[col].dropna()
      8     mean_ = s.mean()
      9     median_ = s.median()

NameError: name 'df' is not defined
In [2]:
midrange_only = {col: (df[col].max() + df[col].min())/2 for col in num_cols}
pd.Series(midrange_only, name='midrange')
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[2], line 1
----> 1 midrange_only = {col: (df[col].max() + df[col].min())/2 for col in num_cols}
      2 pd.Series(midrange_only, name='midrange')

NameError: name 'df' is not defined
In [3]:
import pandas as pd

# replace with your actual path and file name
df = pd.read_parquet(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\clean_resampled\PVS 1.parquet")

print("Data loaded successfully:", df.shape)
df.head()
Data loaded successfully: (72019, 34)
Out[3]:
gps_acc_x_dashboard gps_acc_y_dashboard gps_acc_z_dashboard gps_acc_x_above_suspension gps_acc_y_above_suspension gps_acc_z_above_suspension gps_acc_x_below_suspension gps_acc_y_below_suspension gps_acc_z_below_suspension gps_gyro_x_dashboard ... gps_temp_dashboard gps_temp_above_suspension gps_temp_below_suspension gps_timestamp_gps gps_latitude gps_longitude gps_speed session speed_drop proxy_incident
timestamp
2019-12-24 20:19:56.540 0.314897 0.187227 9.863572 0.314750 0.166426 9.808869 0.529819 0.097111 9.930623 0.221062 ... 34.274628 34.035014 31.926408 1.577219e+09 -27.717841 -51.098865 0.009128 PVS 1 0.0 0
2019-12-24 20:19:56.560 0.297539 0.187227 9.869558 0.313553 0.154455 9.855556 0.525031 0.100702 9.948579 0.045586 ... 34.358493 34.082936 31.734717 1.577219e+09 -27.717841 -51.098865 0.009128 PVS 1 0.0 0
2019-12-24 20:19:56.580 0.308912 0.199198 9.842024 0.332706 0.159244 9.831614 0.533411 0.092323 9.887527 0.175285 ... 34.370474 33.939168 31.447180 1.577219e+09 -27.717841 -51.098865 0.009128 PVS 1 0.0 0
2019-12-24 20:19:56.600 0.317292 0.157299 9.859981 0.297991 0.156849 9.824431 0.498695 0.404766 9.923440 0.205803 ... 34.514242 33.963129 31.447180 1.577219e+09 -27.717841 -51.098865 0.009128 PVS 1 0.0 0
2019-12-24 20:19:56.620 0.295744 0.148919 9.885120 0.266866 0.156849 9.835205 0.496300 0.097111 9.856403 -0.206184 ... 34.370474 34.082936 31.638871 1.577219e+09 -27.717841 -51.098865 0.009128 PVS 1 0.0 0

5 rows × 34 columns

In [4]:
df.columns
Out[4]:
Index(['gps_acc_x_dashboard', 'gps_acc_y_dashboard', 'gps_acc_z_dashboard',
       'gps_acc_x_above_suspension', 'gps_acc_y_above_suspension',
       'gps_acc_z_above_suspension', 'gps_acc_x_below_suspension',
       'gps_acc_y_below_suspension', 'gps_acc_z_below_suspension',
       'gps_gyro_x_dashboard', 'gps_gyro_y_dashboard', 'gps_gyro_z_dashboard',
       'gps_gyro_x_above_suspension', 'gps_gyro_y_above_suspension',
       'gps_gyro_z_above_suspension', 'gps_gyro_x_below_suspension',
       'gps_gyro_y_below_suspension', 'gps_gyro_z_below_suspension',
       'gps_mag_x_dashboard', 'gps_mag_y_dashboard', 'gps_mag_z_dashboard',
       'gps_mag_x_above_suspension', 'gps_mag_y_above_suspension',
       'gps_mag_z_above_suspension', 'gps_temp_dashboard',
       'gps_temp_above_suspension', 'gps_temp_below_suspension',
       'gps_timestamp_gps', 'gps_latitude', 'gps_longitude', 'gps_speed',
       'session', 'speed_drop', 'proxy_incident'],
      dtype='object')
In [5]:
num_cols = ['gps_speed', 'acc_mag', 'gyro_mag', 'speed_drop']

summary = {}
for col in num_cols:
    s = df[col].dropna()
    mean_ = s.mean()
    median_ = s.median()
    mode_ = s.mode().iloc[0] if not s.mode().empty else float('nan')
    midrange_ = (s.max() + s.min()) / 2.0
    summary[col] = {
        'count': s.shape[0],
        'mean': mean_,
        'median': median_,
        'mode': mode_,
        'min': s.min(),
        'max': s.max(),
        'midrange': midrange_
    }

import pandas as pd
ct_df = pd.DataFrame(summary).T
print(ct_df)
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\indexes\base.py:3805, in Index.get_loc(self, key)
   3804 try:
-> 3805     return self._engine.get_loc(casted_key)
   3806 except KeyError as err:

File index.pyx:167, in pandas._libs.index.IndexEngine.get_loc()

File index.pyx:196, in pandas._libs.index.IndexEngine.get_loc()

File pandas\\_libs\\hashtable_class_helper.pxi:7081, in pandas._libs.hashtable.PyObjectHashTable.get_item()

File pandas\\_libs\\hashtable_class_helper.pxi:7089, in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'acc_mag'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Cell In[5], line 5
      3 summary = {}
      4 for col in num_cols:
----> 5     s = df[col].dropna()
      6     mean_ = s.mean()
      7     median_ = s.median()

File C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\frame.py:4102, in DataFrame.__getitem__(self, key)
   4100 if self.columns.nlevels > 1:
   4101     return self._getitem_multilevel(key)
-> 4102 indexer = self.columns.get_loc(key)
   4103 if is_integer(indexer):
   4104     indexer = [indexer]

File C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\indexes\base.py:3812, in Index.get_loc(self, key)
   3807     if isinstance(casted_key, slice) or (
   3808         isinstance(casted_key, abc.Iterable)
   3809         and any(isinstance(x, slice) for x in casted_key)
   3810     ):
   3811         raise InvalidIndexError(key)
-> 3812     raise KeyError(key) from err
   3813 except TypeError:
   3814     # If we have a listlike key, _check_indexing_error will raise
   3815     #  InvalidIndexError. Otherwise we fall through and re-raise
   3816     #  the TypeError.
   3817     self._check_indexing_error(key)

KeyError: 'acc_mag'
In [6]:
['gps_acc_x_dashboard', 'gps_acc_y_dashboard', 'gps_acc_z_dashboard',
 'gps_acc_x_above_suspension', 'gps_acc_y_above_suspension', 'gps_acc_z_above_suspension',
 'gps_acc_x_below_suspension', 'gps_acc_y_below_suspension', 'gps_acc_z_below_suspension',
 'gps_gyro_x_dashboard', 'gps_gyro_y_dashboard', 'gps_gyro_z_dashboard',
 ...
 'gps_speed', 'session', 'speed_drop', 'proxy_incident']
  Cell In[6], line 5
    ...
    ^
SyntaxError: invalid syntax. Perhaps you forgot a comma?
In [7]:
import numpy as np

# Acceleration magnitude (using dashboard sensor)
df["acc_mag"] = np.sqrt(
    df["gps_acc_x_dashboard"]**2 +
    df["gps_acc_y_dashboard"]**2 +
    df["gps_acc_z_dashboard"]**2
)

# Gyroscope magnitude (using dashboard sensor)
df["gyro_mag"] = np.sqrt(
    df["gps_gyro_x_dashboard"]**2 +
    df["gps_gyro_y_dashboard"]**2 +
    df["gps_gyro_z_dashboard"]**2
)

print("Added new columns:", [c for c in df.columns if "mag" in c])
Added new columns: ['gps_mag_x_dashboard', 'gps_mag_y_dashboard', 'gps_mag_z_dashboard', 'gps_mag_x_above_suspension', 'gps_mag_y_above_suspension', 'gps_mag_z_above_suspension', 'acc_mag', 'gyro_mag']

Central Tendancy¶

In [8]:
num_cols = ['gps_speed', 'acc_mag', 'gyro_mag', 'speed_drop']

summary = {}
for col in num_cols:
    s = df[col].dropna()
    mean_ = s.mean()
    median_ = s.median()
    mode_ = s.mode().iloc[0] if not s.mode().empty else float('nan')
    midrange_ = (s.max() + s.min()) / 2.0
    summary[col] = {
        'count': s.shape[0],
        'mean': mean_,
        'median': median_,
        'mode': mode_,
        'min': s.min(),
        'max': s.max(),
        'midrange': midrange_
    }

import pandas as pd
ct_df = pd.DataFrame(summary).T
ct_df
Out[8]:
count mean median mode min max midrange
gps_speed 72019.0 9.556739 6.618945 0.005715 0.002526 26.874480 13.438503
acc_mag 72019.0 9.948794 9.859017 9.792895 0.409363 27.451342 13.930352
gyro_mag 72019.0 6.070509 4.775196 4.116235 0.009907 54.310432 27.160169
speed_drop 72019.0 0.159041 0.004228 0.000000 0.000000 3.895296 1.947648
In [9]:
import matplotlib.pyplot as plt

for col in num_cols:
    s = df[col].dropna()
    plt.figure(figsize=(8,4))
    plt.hist(s, bins=50, alpha=0.7, edgecolor='black')
    plt.axvline(s.mean(), color='red', linestyle='--', linewidth=2, label=f"Mean = {s.mean():.2f}")
    plt.axvline(s.median(), color='green', linestyle=':', linewidth=2, label=f"Median = {s.median():.2f}")
    plt.title(f"Histogram of {col}")
    plt.xlabel(col)
    plt.ylabel("Frequency")
    plt.legend()
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [10]:
for col in num_cols:
    df[col].plot(kind='kde', linewidth=2, figsize=(7,3), title=f"KDE Plot of {col}")
    plt.xlabel(col)
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [11]:
df[num_cols].plot.box(figsize=(7,4))
plt.title("Boxplot of Key Sensor Features")
plt.ylabel("Value")
plt.show()
No description has been provided for this image
In [3]:
ct_df.to_csv("central_tendency_summary.csv")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[3], line 1
----> 1 ct_df.to_csv("central_tendency_summary.csv")

NameError: name 'ct_df' is not defined
In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load cleaned dataset (example session)
df = pd.read_parquet(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\clean_resampled\PVS 1.parquet")

# Example: histogram and correlation heatmap
plt.figure(figsize=(8,4))
plt.hist(df['gps_speed'], bins=50, color='skyblue', edgecolor='black')
plt.title("Histogram of Vehicle Speed")
plt.xlabel("Speed (km/h)")
plt.ylabel("Frequency")
plt.show()

plt.figure(figsize=(8,6))
sns.heatmap(df.corr(), cmap='coolwarm', center=0)
plt.title("Correlation Heatmap of Sensor Features")
plt.tight_layout()
plt.show()
No description has been provided for this image
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[2], line 17
     14 plt.show()
     16 plt.figure(figsize=(8,6))
---> 17 sns.heatmap(df.corr(), cmap='coolwarm', center=0)
     18 plt.title("Correlation Heatmap of Sensor Features")
     19 plt.tight_layout()

File C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\frame.py:11049, in DataFrame.corr(self, method, min_periods, numeric_only)
  11047 cols = data.columns
  11048 idx = cols.copy()
> 11049 mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)
  11051 if method == "pearson":
  11052     correl = libalgos.nancorr(mat, minp=min_periods)

File C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\frame.py:1993, in DataFrame.to_numpy(self, dtype, copy, na_value)
   1991 if dtype is not None:
   1992     dtype = np.dtype(dtype)
-> 1993 result = self._mgr.as_array(dtype=dtype, copy=copy, na_value=na_value)
   1994 if result.dtype is not dtype:
   1995     result = np.asarray(result, dtype=dtype)

File C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\internals\managers.py:1694, in BlockManager.as_array(self, dtype, copy, na_value)
   1692         arr.flags.writeable = False
   1693 else:
-> 1694     arr = self._interleave(dtype=dtype, na_value=na_value)
   1695     # The underlying data was copied within _interleave, so no need
   1696     # to further copy if copy=True or setting na_value
   1698 if na_value is lib.no_default:

File C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\internals\managers.py:1753, in BlockManager._interleave(self, dtype, na_value)
   1751     else:
   1752         arr = blk.get_values(dtype)
-> 1753     result[rl.indexer] = arr
   1754     itemmask[rl.indexer] = 1
   1756 if not itemmask.all():

ValueError: could not convert string to float: 'PVS 1'
<Figure size 800x600 with 0 Axes>
In [2]:
df = pd.read_parquet(DATA)
print("Shape:", df.shape)
display(df.head(5))

# (Optional) show columns
pd.Series(df.columns, name="columns").to_frame().head(25)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[2], line 1
----> 1 df = pd.read_parquet(DATA)
      2 print("Shape:", df.shape)
      3 display(df.head(5))

NameError: name 'pd' is not defined
In [3]:
# Core libs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# I/O paths (edit to your actual file)
DATA = r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\clean_resampled\PVS 1.parquet"
# ART = r"C:\Users\<you>\Desktop\Accident_Detection_Project_Dataset\report_artifacts"  # where images/csvs will be saved
In [4]:
df = pd.read_parquet(DATA)
print("Shape:", df.shape)
display(df.head(5))

# (Optional) show columns
pd.Series(df.columns, name="columns").to_frame().head(25)
Shape: (72019, 34)
gps_acc_x_dashboard gps_acc_y_dashboard gps_acc_z_dashboard gps_acc_x_above_suspension gps_acc_y_above_suspension gps_acc_z_above_suspension gps_acc_x_below_suspension gps_acc_y_below_suspension gps_acc_z_below_suspension gps_gyro_x_dashboard ... gps_temp_dashboard gps_temp_above_suspension gps_temp_below_suspension gps_timestamp_gps gps_latitude gps_longitude gps_speed session speed_drop proxy_incident
timestamp
2019-12-24 20:19:56.540 0.314897 0.187227 9.863572 0.314750 0.166426 9.808869 0.529819 0.097111 9.930623 0.221062 ... 34.274628 34.035014 31.926408 1.577219e+09 -27.717841 -51.098865 0.009128 PVS 1 0.0 0
2019-12-24 20:19:56.560 0.297539 0.187227 9.869558 0.313553 0.154455 9.855556 0.525031 0.100702 9.948579 0.045586 ... 34.358493 34.082936 31.734717 1.577219e+09 -27.717841 -51.098865 0.009128 PVS 1 0.0 0
2019-12-24 20:19:56.580 0.308912 0.199198 9.842024 0.332706 0.159244 9.831614 0.533411 0.092323 9.887527 0.175285 ... 34.370474 33.939168 31.447180 1.577219e+09 -27.717841 -51.098865 0.009128 PVS 1 0.0 0
2019-12-24 20:19:56.600 0.317292 0.157299 9.859981 0.297991 0.156849 9.824431 0.498695 0.404766 9.923440 0.205803 ... 34.514242 33.963129 31.447180 1.577219e+09 -27.717841 -51.098865 0.009128 PVS 1 0.0 0
2019-12-24 20:19:56.620 0.295744 0.148919 9.885120 0.266866 0.156849 9.835205 0.496300 0.097111 9.856403 -0.206184 ... 34.370474 34.082936 31.638871 1.577219e+09 -27.717841 -51.098865 0.009128 PVS 1 0.0 0

5 rows × 34 columns

Out[4]:
columns
0 gps_acc_x_dashboard
1 gps_acc_y_dashboard
2 gps_acc_z_dashboard
3 gps_acc_x_above_suspension
4 gps_acc_y_above_suspension
5 gps_acc_z_above_suspension
6 gps_acc_x_below_suspension
7 gps_acc_y_below_suspension
8 gps_acc_z_below_suspension
9 gps_gyro_x_dashboard
10 gps_gyro_y_dashboard
11 gps_gyro_z_dashboard
12 gps_gyro_x_above_suspension
13 gps_gyro_y_above_suspension
14 gps_gyro_z_above_suspension
15 gps_gyro_x_below_suspension
16 gps_gyro_y_below_suspension
17 gps_gyro_z_below_suspension
18 gps_mag_x_dashboard
19 gps_mag_y_dashboard
20 gps_mag_z_dashboard
21 gps_mag_x_above_suspension
22 gps_mag_y_above_suspension
23 gps_mag_z_above_suspension
24 gps_temp_dashboard
In [5]:
# --- A2-1b. Engineer common features if missing ---
def ensure_feature(df, name, expr):
    if name not in df.columns:
        df[name] = expr(df)

ensure_feature(df, "acc_mag",
               lambda d: np.sqrt(d.filter(like="gps_acc_").pow(2).sum(axis=1)))
ensure_feature(df, "gyro_mag",
               lambda d: np.sqrt(d.filter(like="gps_gyro_").pow(2).sum(axis=1)))
ensure_feature(df, "speed_drop",
               lambda d: d["gps_speed"].diff().clip(upper=0).abs().fillna(0))

# target column should exist (rename to your project target if needed)
target = "proxy_incident"  # 0/1
assert target in df.columns, "Add/rename your label column to 'proxy_incident'."

Central Tendancy¶

In [7]:
num_cols = ["gps_speed", "acc_mag", "gyro_mag", "speed_drop"]

summary_ct = {}
for col in num_cols:
    s = df[col].dropna()
    mode_vals = s.mode()
    summary_ct[col] = {
        "count": int(s.shape[0]),
        "mean":  float(s.mean()),
        "median": float(s.median()),
        "mode":  float(mode_vals.iloc[0]) if not mode_vals.empty else np.nan,
        "min":   float(s.min()),
        "max":   float(s.max()),
        "midrange": float((s.max() + s.min())/2.0),
    }

ct_df = pd.DataFrame(summary_ct).T.round(4)
display(ct_df)

# (optional) save
#ct_df.to_csv(f"{ART}/A2_central_tendency.csv", index=True)
count mean median mode min max midrange
gps_speed 72019.0 9.5567 6.6189 0.0057 0.0025 26.8745 13.4385
acc_mag 72019.0 18.4213 17.3682 12.9027 4.0950 71.4404 37.7677
gyro_mag 72019.0 27.7976 22.8371 61.5897 0.0895 267.2069 133.6482
speed_drop 72019.0 0.1590 0.0042 0.0000 0.0000 3.8953 1.9476

Dispersion¶

In [10]:
disp_rows = []
for col in num_cols:
    s = df[col].dropna()
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    disp_rows.append({
        "feature": col,
        "range": float(s.max() - s.min()),
        "Q1": float(q1),
        "Q3": float(q3),
        "IQR": float(q3 - q1),
        "variance": float(s.var()),
        "std_dev": float(s.std())
    })

disp_df = pd.DataFrame(disp_rows).set_index("feature").round(4)
display(disp_df)
#disp_df.to_csv(f"{ART}/A2_dispersion.csv")

# Optional supporting boxplot (one figure with four boxes)
plt.figure(figsize=(7,4))
sns.boxplot(data=df[num_cols], orient="h")
plt.title("Dispersion via Boxplots (PVS_1)")
plt.tight_layout()
#plt.savefig(f"{ART}/A2_boxplots.png", dpi=300)
plt.show()
range Q1 Q3 IQR variance std_dev
feature
gps_speed 26.8720 4.5089 16.6475 12.1386 60.0073 7.7464
acc_mag 67.3453 16.1825 19.8167 3.6342 17.3873 4.1698
gyro_mag 267.1173 12.7269 37.5753 24.8484 503.6426 22.4420
speed_drop 3.8953 0.0000 0.2260 0.2260 0.0816 0.2856
No description has been provided for this image

Skewness¶

In [11]:
skew_tbl = df[num_cols].skew(numeric_only=True).to_frame("skew").round(4)
display(skew_tbl)
#skew_tbl.to_csv(f"{ART}/A2_skewness.csv")

# One supporting histogram/KDE for a key feature
plt.figure(figsize=(6,3.5))
sns.kdeplot(df["gps_speed"].dropna(), fill=True)
plt.title("KDE: gps_speed (PVS_1)")
plt.tight_layout()
#plt.savefig(f"{ART}/A2_kde_speed.png", dpi=300)
plt.show()
skew
gps_speed 0.8553
acc_mag 2.0755
gyro_mag 1.7439
speed_drop 3.8867
No description has been provided for this image
In [12]:
cls_counts = df[target].value_counts().sort_index()
display(cls_counts.to_frame("count"))

plt.figure(figsize=(4.5,3.5))
cls_counts.plot(kind="bar", color=["#4C78A8", "#F58518"])
plt.title("Class Counts (0=No Accident, 1=Accident)")
plt.xlabel("proxy_incident")
plt.ylabel("count")
plt.tight_layout()
#plt.savefig(f"{ART}/A2_class_counts.png", dpi=300)
plt.show()

# Optional % table
(cls_counts / cls_counts.sum()).round(4).to_frame("proportion")
count
proxy_incident
0 72019
No description has been provided for this image
Out[12]:
proportion
proxy_incident
0 1.0

Correlation¶

In [13]:
# choose only numeric columns used downstream
corr = df[num_cols].corr(numeric_only=True)

plt.figure(figsize=(5.2,4.5))
sns.heatmap(corr, vmin=-1, vmax=1, annot=True, cmap="coolwarm", square=True, cbar_kws={'shrink': .75})
plt.title("Correlation: Selected PVS Features")
plt.tight_layout()
#plt.savefig(f"{ART}/A2_corr_heatmap.png", dpi=300)
plt.show()
No description has been provided for this image
In [14]:
# Proxy label distribution
if "proxy_incident" in df.columns:
    vc = df["proxy_incident"].value_counts()
    print("Proxy incident counts:\n", vc)
    vc.plot(kind="bar")
    plt.title("Proxy incident distribution (per timestamp)")
    plt.xticks(rotation=0); plt.show()
Proxy incident counts:
 proxy_incident
0    72019
Name: count, dtype: int64
No description has been provided for this image
In [16]:
# Libraries
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Paths
DATA = r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\clean_resampled\PVS 1.parquet"
ART  = r"C:\Users\<you>\Desktop\Accident_Detection_Project_Dataset\report_artifacts"

# Load
df = pd.read_parquet(DATA)

# Ensure engineered features exist (same as A2)
def ensure_feature(df, name, expr):
    if name not in df.columns:
        df[name] = expr(df)

ensure_feature(df, "acc_mag",
               lambda d: np.sqrt(d.filter(like="gps_acc_").pow(2).sum(axis=1)))
ensure_feature(df, "gyro_mag",
               lambda d: np.sqrt(d.filter(like="gps_gyro_").pow(2).sum(axis=1)))
ensure_feature(df, "speed_drop",
               lambda d: d["gps_speed"].diff().clip(upper=0).abs().fillna(0))

target = "proxy_incident"     # 0/1 label column
assert target in df.columns

# Cast numeric columns used in plots
num_cols = ["gps_speed", "acc_mag", "gyro_mag", "speed_drop"]
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors="coerce")

# Small helper to save/show uniformly
def save_fig(path):
    plt.tight_layout()
    plt.savefig(path, dpi=300)
    plt.show()
In [18]:
ts = df.set_index("timestamp").sort_index()
ts_sample = ts[num_cols].resample("1S").mean().iloc[:600]  # first 10 minutes
plt.figure(figsize=(9,3.5))
plt.plot(ts_sample.index, ts_sample["gps_speed"])
plt.title("Line Plot: gps_speed over time (first 10 minutes)")
plt.xlabel("Time"); plt.ylabel("Speed (km/h)")
#save_fig(f"{ART}/A3_line_speed.png")
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_11892\439280025.py in ?()
----> 1 ts = df.set_index("timestamp").sort_index()
      2 ts_sample = ts[num_cols].resample("1S").mean().iloc[:600]  # first 10 minutes
      3 plt.figure(figsize=(9,3.5))
      4 plt.plot(ts_sample.index, ts_sample["gps_speed"])

C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\frame.py in ?(self, keys, drop, append, inplace, verify_integrity)
   6118                     if not found:
   6119                         missing.append(col)
   6120 
   6121         if missing:
-> 6122             raise KeyError(f"None of {missing} are in the columns")
   6123 
   6124         if inplace:
   6125             frame = self

KeyError: "None of ['timestamp'] are in the columns"

Assignment 2 Testing¶

In [1]:
# --- PATHS ---
from pathlib import Path
BASE = Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset")
WORK = BASE / "work"
CLEAN_DIR = WORK / "clean_resampled"
EDA_DIR = WORK / "eda_reports"

EDA_DIR.mkdir(parents=True, exist_ok=True)

# --- LIBS ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import chain
plt.rcParams["figure.dpi"] = 140

# --- HELPERS ---
def save_fig(name):
    out = EDA_DIR / f"{name}.png"
    plt.tight_layout()
    plt.savefig(out, bbox_inches="tight")
    print("Saved figure:", out)

def save_csv(df, name):
    out = EDA_DIR / f"{name}.csv"
    df.to_csv(out, index=False)
    print("Saved CSV:", out)

def pick_first_existing(df, candidates):
    for c in candidates:
        if c in df.columns:
            return c
    return None
In [2]:
# Load a single cleaned parquet to prototype EDA (PVS 1)
one = CLEAN_DIR / "PVS 1.parquet"
df = pd.read_parquet(one)
print(df.shape)
df.head(3)
(72019, 34)
Out[2]:
gps_acc_x_dashboard gps_acc_y_dashboard gps_acc_z_dashboard gps_acc_x_above_suspension gps_acc_y_above_suspension gps_acc_z_above_suspension gps_acc_x_below_suspension gps_acc_y_below_suspension gps_acc_z_below_suspension gps_gyro_x_dashboard ... gps_temp_dashboard gps_temp_above_suspension gps_temp_below_suspension gps_timestamp_gps gps_latitude gps_longitude gps_speed session speed_drop proxy_incident
timestamp
2019-12-24 20:19:56.540 0.314897 0.187227 9.863572 0.314750 0.166426 9.808869 0.529819 0.097111 9.930623 0.221062 ... 34.274628 34.035014 31.926408 1.577219e+09 -27.717841 -51.098865 0.009128 PVS 1 0.0 0
2019-12-24 20:19:56.560 0.297539 0.187227 9.869558 0.313553 0.154455 9.855556 0.525031 0.100702 9.948579 0.045586 ... 34.358493 34.082936 31.734717 1.577219e+09 -27.717841 -51.098865 0.009128 PVS 1 0.0 0
2019-12-24 20:19:56.580 0.308912 0.199198 9.842024 0.332706 0.159244 9.831614 0.533411 0.092323 9.887527 0.175285 ... 34.370474 33.939168 31.447180 1.577219e+09 -27.717841 -51.098865 0.009128 PVS 1 0.0 0

3 rows × 34 columns

In [3]:
# Detect label
label_col = pick_first_existing(df, ["proxy_incident", "label", "target", "y", "class"])
print("Label column:", label_col)

# Candidate numeric features (use what exists)
candidates = ["gps_speed", "acc_mag", "gyro_mag", "speed_drop"]

num_cols = [c for c in candidates if c in df.columns]
print("Numeric features found:", num_cols)

# Drop fully-empty columns if any
df = df.dropna(axis=1, how="all")
Label column: proxy_incident
Numeric features found: ['gps_speed', 'speed_drop']
In [4]:
miss = df.isna().sum().reset_index()
miss.columns = ["column", "missing_count"]
miss["missing_pct"] = (miss["missing_count"]/len(df)).round(4)
save_csv(miss, "missing_values")
Saved CSV: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\eda_reports\missing_values.csv
In [5]:
summary_rows = []
for col in num_cols:
    s = df[col].dropna()
    mode_ = s.mode().iloc[0] if not s.mode().empty else np.nan
    midrange = (s.min() + s.max()) / 2.0
    summary_rows.append({
        "feature": col,
        "count": int(s.shape[0]),
        "mean": s.mean(),
        "median": s.median(),
        "mode": mode_,
        "min": s.min(),
        "max": s.max(),
        "midrange": midrange
    })

summary_df = pd.DataFrame(summary_rows)
save_csv(summary_df, "numeric_summary")
summary_df
Saved CSV: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\eda_reports\numeric_summary.csv
Out[5]:
feature count mean median mode min max midrange
0 gps_speed 72019 9.556739 6.618945 0.005715 0.002526 26.874480 13.438503
1 speed_drop 72019 0.159041 0.004228 0.000000 0.000000 3.895296 1.947648
In [6]:
for col in num_cols:
    s = df[col].dropna()
    fig, ax = plt.subplots(figsize=(6,3))
    ax.hist(s, bins=60, alpha=0.7)
    ax.axvline(s.mean(), color="r", linestyle="--", label=f"Mean={s.mean():.2f}")
    ax.axvline(s.median(), color="g", linestyle="-.", label=f"Median={s.median():.2f}")
    ax.set_title(f"Histogram with Mean/Median — {col}")
    ax.legend()
    save_fig(f"central_tendency_{col}")
    plt.close(fig)
Saved figure: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\eda_reports\central_tendency_gps_speed.png
Saved figure: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\eda_reports\central_tendency_speed_drop.png
In [7]:
disp_rows = []
for col in num_cols:
    s = df[col].dropna()
    q1, q2, q3 = s.quantile([0.25, 0.50, 0.75])
    disp_rows.append({
        "feature": col,
        "range": s.max() - s.min(),
        "q1": q1,
        "median(q2)": q2,
        "q3": q3,
        "iqr": q3 - q1,
        "variance": s.var(),
        "std": s.std()
    })
disp_df = pd.DataFrame(disp_rows)
save_csv(disp_df, "dispersion_table")
disp_df
Saved CSV: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\eda_reports\dispersion_table.csv
Out[7]:
feature range q1 median(q2) q3 iqr variance std
0 gps_speed 26.871954 4.508887 6.618945 16.647470 12.138583 60.007341 7.746441
1 speed_drop 3.895296 0.000000 0.004228 0.225994 0.225994 0.081572 0.285608
In [8]:
for col in num_cols:
    fig, ax = plt.subplots(1, 2, figsize=(9,3))
    sns.boxplot(x=df[col], ax=ax[0])
    ax[0].set_title(f"Box — {col}")
    sns.violinplot(x=df[col], ax=ax[1])
    ax[1].set_title(f"Violin — {col}")
    save_fig(f"box_violin_{col}")
    plt.close(fig)
Saved figure: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\eda_reports\box_violin_gps_speed.png
Saved figure: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\eda_reports\box_violin_speed_drop.png
In [9]:
sk_rows = []
for col in num_cols:
    s = df[col].dropna()
    sk_rows.append({"feature": col, "skewness": s.skew()})
    fig, ax = plt.subplots(figsize=(6,3))
    sns.kdeplot(s, fill=True, ax=ax)
    ax.set_title(f"KDE & Skewness — {col} (skew={s.skew():.2f})")
    save_fig(f"skew_kde_{col}")
    plt.close(fig)

sk_df = pd.DataFrame(sk_rows)
save_csv(sk_df, "skewness_table")
sk_df
Saved figure: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\eda_reports\skew_kde_gps_speed.png
Saved figure: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\eda_reports\skew_kde_speed_drop.png
Saved CSV: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\eda_reports\skewness_table.csv
Out[9]:
feature skewness
0 gps_speed 0.855313
1 speed_drop 3.886691

Class Imbalance¶

In [10]:
if label_col:
    counts = df[label_col].value_counts().rename_axis("class").reset_index(name="count")
    counts["pct"] = (counts["count"]/counts["count"].sum()).round(4)
    save_csv(counts, "class_counts")
    print(counts)

    # Bar chart
    fig, ax = plt.subplots(figsize=(4,3))
    sns.barplot(x="class", y="count", data=counts, ax=ax)
    ax.set_title("Class Balance (Counts)")
    for i, v in enumerate(counts["count"]):
        ax.text(i, v, str(v), ha="center", va="bottom")
    save_fig("class_balance")
    plt.close(fig)
Saved CSV: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\eda_reports\class_counts.csv
   class  count  pct
0      0  72019  1.0
Saved figure: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\eda_reports\class_balance.png
In [11]:
num_for_corr = df.select_dtypes(include=[np.number]).drop(columns=[label_col], errors="ignore")
fig, ax = plt.subplots(figsize=(8,6))
sns.heatmap(num_for_corr.corr(), cmap="coolwarm", center=0)
ax.set_title("Correlation Heatmap of Sensor Features")
save_fig("corr_heatmap")
plt.close(fig)
Saved figure: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\eda_reports\corr_heatmap.png
In [12]:
# If you want bigger sample for EDA: concatenate PVS 1..3
files = [CLEAN_DIR/f"PVS {i}.parquet" for i in [1,2,3] if (CLEAN_DIR/f"PVS {i}.parquet").exists()]
big = pd.concat([pd.read_parquet(p) for p in files], ignore_index=True)
big.shape
# You can rerun sections 2–8 with df = big
Out[12]:
(187270, 34)
In [13]:
import matplotlib.pyplot as plt
import seaborn as sns

feature = "gps_speed"  # you can also try 'acc_mag' or 'gyro_mag'
s = df[feature].dropna()

plt.figure(figsize=(7,4))
sns.histplot(s, bins=50, color='skyblue', edgecolor='black', kde=True)
plt.axvline(s.mean(), color='red', linestyle='--', linewidth=2, label=f"Mean = {s.mean():.2f}")
plt.axvline(s.median(), color='green', linestyle=':', linewidth=2, label=f"Median = {s.median():.2f}")
plt.title(f"Histogram of {feature} with Mean and Median")
plt.xlabel(feature)
plt.ylabel("Frequency")
plt.legend()
plt.tight_layout()
plt.show()
No description has been provided for this image
In [14]:
means = df[["gps_speed", "acc_mag", "gyro_mag", "speed_drop"]].mean()
plt.figure(figsize=(6,4))
means.plot(kind='bar', color=['#5DADE2', '#F5B041', '#58D68D', '#AF7AC5'])
plt.title("Mean Values of Key Sensor Features")
plt.xlabel("Feature")
plt.ylabel("Mean")
plt.tight_layout()
plt.show()
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[14], line 1
----> 1 means = df[["gps_speed", "acc_mag", "gyro_mag", "speed_drop"]].mean()
      2 plt.figure(figsize=(6,4))
      3 means.plot(kind='bar', color=['#5DADE2', '#F5B041', '#58D68D', '#AF7AC5'])

File C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\frame.py:4108, in DataFrame.__getitem__(self, key)
   4106     if is_iterator(key):
   4107         key = list(key)
-> 4108     indexer = self.columns._get_indexer_strict(key, "columns")[1]
   4110 # take() does not accept boolean indexers
   4111 if getattr(indexer, "dtype", None) == bool:

File C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\indexes\base.py:6200, in Index._get_indexer_strict(self, key, axis_name)
   6197 else:
   6198     keyarr, indexer, new_indexer = self._reindex_non_unique(keyarr)
-> 6200 self._raise_if_missing(keyarr, indexer, axis_name)
   6202 keyarr = self.take(indexer)
   6203 if isinstance(key, Index):
   6204     # GH 42790 - Preserve name from an Index

File C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\indexes\base.py:6252, in Index._raise_if_missing(self, key, indexer, axis_name)
   6249     raise KeyError(f"None of [{key}] are in the [{axis_name}]")
   6251 not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique())
-> 6252 raise KeyError(f"{not_found} not in index")

KeyError: "['acc_mag', 'gyro_mag'] not in index"
In [15]:
plt.figure(figsize=(4,3.5))
df["proxy_incident"].value_counts().sort_index().plot(kind='bar', color=["#3498DB","#E74C3C"], edgecolor='black')
plt.title("Class Distribution: No Accident (0) vs Accident (1)")
plt.xlabel("Class Label")
plt.ylabel("Count")
plt.xticks([0,1], ["No Accident", "Accident"], rotation=0)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [16]:
plt.figure(figsize=(4,4))
df["proxy_incident"].value_counts().sort_index().plot.pie(
    autopct='%1.1f%%', colors=["#5DADE2","#E74C3C"], labels=["No Accident", "Accident"],
    startangle=90, wedgeprops={"edgecolor":"white"}
)
plt.title("Proportion of Accident vs Non-Accident Samples")
plt.ylabel("")  # remove y-label
plt.tight_layout()
plt.show()
No description has been provided for this image
In [17]:
class_stats = df["proxy_incident"].value_counts().rename_axis("Class").reset_index(name="Count")
class_stats["Percentage"] = (class_stats["Count"]/class_stats["Count"].sum()*100).round(2)
class_stats
Out[17]:
Class Count Percentage
0 0 72019 100.0

Assignemnt 3 - Graphs¶

In [18]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# ---- paths (match your folder layout) ----
BASE = Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset")
WORK = BASE / "work"
CLEAN = WORK / "clean_resampled"        # where your PVS 1..9 parquet live
EDA   = WORK / "eda_reports" / "visuals"
EDA.mkdir(parents=True, exist_ok=True)

# ---- load one or merge a few sessions (example: PVS 1) ----
df = pd.read_parquet(CLEAN / "PVS 1.parquet")

# label column name used in your work
LABEL = "proxy_incident"   # 0: no-accident, 1: accident

# numeric-only view + a safe sample for heavy plots
num_df = df.select_dtypes(include=np.number).copy()
samp   = num_df.sample(min(len(num_df), 200_000), random_state=42)  # keeps things snappy

# consistent look
sns.set_theme(style="whitegrid")
palette = {"No Accident":"#2E86C1", "Accident":"#E74C3C"}

Line Plot - Time Series¶

In [19]:
plt.figure(figsize=(9,4))
df["gps_speed"].iloc[:20_000].plot(color="#2E86C1")
plt.title("Line Plot – Vehicle Speed Over Time (PVS 1)")
plt.xlabel("Time index"); plt.ylabel("Speed (km/h)")
plt.tight_layout()
plt.savefig(EDA / "A_line_speed.png", dpi=200); plt.show()
No description has been provided for this image

Vertical Bar Plot - Class Counts¶

In [20]:
plt.figure(figsize=(4.5,4))
(df[LABEL]
 .map({0:"No Accident",1:"Accident"})
 .value_counts().reindex(["No Accident","Accident"])
 .plot(kind="bar", color=[palette["No Accident"], palette["Accident"]], edgecolor="black"))
plt.title("Bar Plot – Class Distribution")
plt.xlabel("Class"); plt.ylabel("Count"); plt.xticks(rotation=0)
plt.tight_layout()
plt.savefig(EDA / "B_bar_vertical_class.png", dpi=200); plt.show()
No description has been provided for this image

Horizontal Bar Plot - feature means¶

In [33]:
import numpy as np

# create derived magnitudes if not already in your df
if "acc_mag" not in df.columns:
    df["acc_mag"] = np.sqrt(
        df.filter(like="gps_acc_").pow(2).sum(axis=1)
    )

if "gyro_mag" not in df.columns:
    df["gyro_mag"] = np.sqrt(
        df.filter(like="gps_gyro_").pow(2).sum(axis=1)
    )

# make a numeric-only copy for plotting
num_df = df.select_dtypes(include=np.number)
In [34]:
means = num_df[["gps_speed","acc_mag","gyro_mag","speed_drop"]].mean().sort_values()
plt.figure(figsize=(6,4))
means.plot(kind="barh", color="#5DADE2", edgecolor="black")
plt.title("Horizontal Bar Plot – Mean of Key Sensors")
plt.xlabel("Mean value"); plt.ylabel("Feature")
plt.tight_layout(); plt.show()
No description has been provided for this image

Histogram - with KDE overlay¶

In [36]:
features = ["gps_speed", "acc_mag", "gyro_mag", "speed_drop"]

for col in features:
    plt.figure(figsize=(6,4))
    sns.histplot(df[col].dropna(), bins=50, kde=True, color="#5DADE2", edgecolor="black")
    plt.title(f"Histogram + KDE – {col}")
    plt.xlabel(col); plt.ylabel("Frequency")
    plt.tight_layout(); plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Pie Chart - Class Proportion¶

In [23]:
plt.figure(figsize=(4.6,4.6))
(df[LABEL].map({0:"No Accident",1:"Accident"})
 .value_counts().reindex(["No Accident","Accident"])
 .plot.pie(autopct="%.1f%%", colors=[palette["No Accident"], palette["Accident"]],
           startangle=90, wedgeprops={"edgecolor":"white"}))
plt.title("Pie – Accident vs Non-Accident")
plt.ylabel("")
plt.tight_layout()
plt.savefig(EDA / "E_pie_class.png", dpi=200); plt.show()
No description has been provided for this image

KDE - Density Plot¶

In [24]:
plt.figure(figsize=(6,4))
sns.kdeplot(data=num_df, x="gps_speed", fill=True, alpha=0.35, color="#2E86C1")
plt.title("KDE – Speed Distribution")
plt.xlabel("Speed (km/h)"); plt.ylabel("Density")
plt.tight_layout()
plt.savefig(EDA / "F_kde_speed.png", dpi=200); plt.show()
No description has been provided for this image

Area Plot - Cumulative mean¶

In [25]:
plt.figure(figsize=(7,3.8))
num_df["gps_speed"].dropna().rolling(1500).mean().clip(lower=0).plot(kind="area", color="#AED6F1")
plt.title("Area – Rolling Mean Speed (Window=1500)")
plt.xlabel("Time index"); plt.ylabel("Mean speed (km/h)")
plt.tight_layout()
plt.savefig(EDA / "G_area_rolling_mean_speed.png", dpi=200); plt.show()
No description has been provided for this image

Box and Whisket Plot¶

In [37]:
import matplotlib.pyplot as plt
import seaborn as sns

# Choose features you want to visualize
features = ["gps_speed", "acc_mag", "gyro_mag", "speed_drop"]

plt.figure(figsize=(8, 5))
sns.boxplot(data=df[features], orient="h", palette="pastel")
plt.title("Box and Whisker Plot – Feature Distributions")
plt.xlabel("Value")
plt.ylabel("Sensor Features")
plt.tight_layout()
plt.show()
No description has been provided for this image
In [31]:
# Boxplots: y=0 vs y=1 for each chosen feature
import math
r = math.ceil(len(top)/2)
fig, axes = plt.subplots(r, 2, figsize=(12, 4*r))
axes = axes.flatten()
for i, c in enumerate(top):
    try:
        W.boxplot(column=c, by="y", ax=axes[i])
        axes[i].set_title(c); axes[i].set_xlabel("y"); axes[i].set_ylabel(c)
    except Exception as e:
        axes[i].set_visible(False)
plt.suptitle("Feature distributions by class (windows)"); plt.tight_layout(); plt.show()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[31], line 3
      1 # Boxplots: y=0 vs y=1 for each chosen feature
      2 import math
----> 3 r = math.ceil(len(top)/2)
      4 fig, axes = plt.subplots(r, 2, figsize=(12, 4*r))
      5 axes = axes.flatten()

NameError: name 'top' is not defined

Scatter Plot - Relationship¶

In [38]:
plt.figure(figsize=(6, 4))
sns.scatterplot(
    data=df,
    x="gps_speed",
    y="acc_mag",
    hue="proxy_incident",
    palette={0: "#5DADE2", 1: "#E74C3C"},
    alpha=0.6,
    s=15
)
plt.title("Scatter Plot – Speed vs Acceleration Magnitude")
plt.xlabel("Speed (km/h)")
plt.ylabel("Acceleration (m/s²)")
plt.legend(title="Class (0=Normal, 1=Accident)")
plt.tight_layout()
plt.show()
No description has been provided for this image
In [39]:
plt.figure(figsize=(6, 4))
sns.scatterplot(
    data=df,
    x="gps_speed",
    y="gyro_mag",
    hue="proxy_incident",
    palette={0: "#5DADE2", 1: "#E74C3C"},
    alpha=0.6,
    s=15
)
plt.title("Scatter Plot – Speed vs Gyroscope Magnitude")
plt.xlabel("Speed (km/h)")
plt.ylabel("Gyroscope (°/s)")
plt.legend(title="Class (0=Normal, 1=Accident)")
plt.tight_layout()
plt.show()
No description has been provided for this image

Hexbin Plot - Dense Scatter Alternative¶

In [42]:
import os, numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns

# where to save figures (adjust if you like)
ART = r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\eda_reports"
os.makedirs(ART, exist_ok=True)

# df must already be loaded; if not, uncomment and point to your file:
# df = pd.read_parquet(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\clean_resampled\PVS 1.parquet")

# ---- derived features expected by the plots ----
def pick(df, *candidates):
    """Return first column that exists, else None."""
    for c in candidates:
        if c in df.columns: return c
    return None

# acceleration magnitude (prefer '..._dashboard' axes)
ax = pick(df, "gps_acc_x_dashboard", "gps_acc_x_above_suspension", "gps_acc_x_below_suspension")
ay = pick(df, "gps_acc_y_dashboard", "gps_acc_y_above_suspension", "gps_acc_y_below_suspension")
az = pick(df, "gps_acc_z_dashboard", "gps_acc_z_above_suspension", "gps_acc_z_below_suspension")
if all([ax, ay, az]):
    df["acc_mag"] = np.sqrt(df[ax]**2 + df[ay]**2 + df[az]**2)

# gyroscope magnitude (prefer '..._dashboard' axes)
gx = pick(df, "gps_gyro_x_dashboard", "gps_gyro_x_above_suspension", "gps_gyro_x_below_suspension")
gy = pick(df, "gps_gyro_y_dashboard", "gps_gyro_y_above_suspension", "gps_gyro_y_below_suspension")
gz = pick(df, "gps_gyro_z_dashboard", "gps_gyro_z_above_suspension", "gps_gyro_z_below_suspension")
if all([gx, gy, gz]):
    df["gyro_mag"] = np.sqrt(df[gx]**2 + df[gy]**2 + df[gz]**2)

# speed_drop (positive when speed decreases)
if "gps_speed" in df.columns:
    df["speed_drop"] = (-df["gps_speed"].diff()).clip(lower=0).fillna(0)

# label fallback if needed
label_col = "proxy_incident" if "proxy_incident" in df.columns else None
if label_col is None:
    df["proxy_incident"] = 0
    label_col = "proxy_incident"

# analysis subset used by the plots
want = [c for c in ["gps_speed","acc_mag","gyro_mag","speed_drop", label_col] if c in df.columns]
seg = df[want].dropna()
print("seg shape:", seg.shape, "cols:", list(seg.columns))
seg shape: (72019, 5) cols: ['gps_speed', 'acc_mag', 'gyro_mag', 'speed_drop', 'proxy_incident']
In [43]:
if {"gps_speed", "gyro_mag"}.issubset(seg.columns):
    plt.figure(figsize=(6.2,4.5))
    plt.hexbin(seg["gps_speed"], seg["gyro_mag"], gridsize=50, cmap="viridis", mincnt=5)
    cb = plt.colorbar(); cb.set_label("Counts")
    plt.title("Hexbin: GPS speed vs Gyro magnitude")
    plt.xlabel("gps_speed"); plt.ylabel("gyro_mag")
    plt.tight_layout()
    plt.savefig(os.path.join(ART, "hexbin_speed_gyro.png"), dpi=200)
    plt.show()
else:
    print("Hexbin skipped: need both 'gps_speed' and 'gyro_mag'. Present:", list(seg.columns))
No description has been provided for this image

Correlation Matrix - Heat Map¶

In [44]:
# keep numeric columns only (avoids "could not convert string to float: 'PVS 1'")
num_for_corr = seg.select_dtypes(include=[np.number])
corr = num_for_corr.corr().clip(-1, 1)

plt.figure(figsize=(6.5,5.2))
sns.heatmap(corr, cmap="coolwarm", vmin=-1, vmax=1, cbar_kws={"shrink": .85})
plt.title("Correlation Heatmap of Numeric Features")
plt.tight_layout()
plt.savefig(os.path.join(ART, "heatmap_corr_numeric.png"), dpi=200)
plt.show()
No description has been provided for this image

Assignment 4 - Preprocessing¶

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay
)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (8, 5)

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

DATA_PATH = Path("C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset") 

LABEL_COL = "label"
  Cell In[2], line 30
    DATA_PATH = Path("C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset")
                     ^
SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape
In [3]:
from pathlib import Path
import pandas as pd
import glob

WINDOWS_PATH = Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\windows")

files = glob.glob(str(WINDOWS_PATH / "*.parquet"))

df_list = [pd.read_parquet(f) for f in files]
df_final = pd.concat(df_list, ignore_index=True)

# Save final combined dataset
OUTPUT_FILE = Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\pvs_final.csv")
df_final.to_csv(OUTPUT_FILE, index=False)

print("Final dataset saved as:", OUTPUT_FILE)
print("Shape:", df_final.shape)
df_final.head()
Final dataset saved as: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\pvs_final.csv
Shape: (7193, 169)
Out[3]:
session t_start t_end y gps_acc_x_dashboard_mean gps_acc_x_dashboard_std gps_acc_x_dashboard_min gps_acc_x_dashboard_max gps_acc_x_dashboard_rms gps_acc_y_dashboard_mean ... speed_drop_mean speed_drop_std speed_drop_min speed_drop_max speed_drop_rms proxy_incident_mean proxy_incident_std proxy_incident_min proxy_incident_max proxy_incident_rms
0 PVS 1 2019-12-24 20:19:56.540 2019-12-24 20:19:59.520 0 0.306168 0.016296 0.270605 0.365176 0.306598 0.165220 ... 0.000099 0.000143 0.0 0.000303 0.000173 0.0 0.0 0.0 0.0 0.0
1 PVS 1 2019-12-24 20:19:58.040 2019-12-24 20:20:01.020 0 0.307486 0.026242 0.223918 0.374752 0.308596 0.163978 ... 0.000201 0.000913 0.0 0.007933 0.000932 0.0 0.0 0.0 0.0 0.0
2 PVS 1 2019-12-24 20:19:59.540 2019-12-24 20:20:02.520 0 0.306282 0.030071 0.195187 0.385526 0.307745 0.163780 ... 0.002592 0.003733 0.0 0.007933 0.004534 0.0 0.0 0.0 0.0 0.0
3 PVS 1 2019-12-24 20:20:01.040 2019-12-24 20:20:04.020 0 0.307264 0.024270 0.195187 0.385526 0.308215 0.164431 ... 0.002487 0.003691 0.0 0.007933 0.004441 0.0 0.0 0.0 0.0 0.0
4 PVS 1 2019-12-24 20:20:02.540 2019-12-24 20:20:05.520 0 0.306369 0.018382 0.263422 0.367570 0.306916 0.163311 ... 0.000186 0.000305 0.0 0.000827 0.000357 0.0 0.0 0.0 0.0 0.0

5 rows × 169 columns

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay
)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (8, 5)

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

DATA_PATH = Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\pvs_final.csv")
LABEL_COL = "label" 

Loading the Final Dataset¶

In [5]:
import pandas as pd
import numpy as np
from pathlib import Path

DATA_PATH = Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\pvs_final.csv")
df = pd.read_csv(DATA_PATH)

print(df.shape)
df.head()
(7193, 169)
Out[5]:
session t_start t_end y gps_acc_x_dashboard_mean gps_acc_x_dashboard_std gps_acc_x_dashboard_min gps_acc_x_dashboard_max gps_acc_x_dashboard_rms gps_acc_y_dashboard_mean ... speed_drop_mean speed_drop_std speed_drop_min speed_drop_max speed_drop_rms proxy_incident_mean proxy_incident_std proxy_incident_min proxy_incident_max proxy_incident_rms
0 PVS 1 2019-12-24 20:19:56.540 2019-12-24 20:19:59.520 0 0.306168 0.016296 0.270605 0.365176 0.306598 0.165220 ... 0.000099 0.000143 0.0 0.000303 0.000173 0.0 0.0 0.0 0.0 0.0
1 PVS 1 2019-12-24 20:19:58.040 2019-12-24 20:20:01.020 0 0.307486 0.026242 0.223918 0.374752 0.308596 0.163978 ... 0.000201 0.000913 0.0 0.007933 0.000932 0.0 0.0 0.0 0.0 0.0
2 PVS 1 2019-12-24 20:19:59.540 2019-12-24 20:20:02.520 0 0.306282 0.030071 0.195187 0.385526 0.307745 0.163780 ... 0.002592 0.003733 0.0 0.007933 0.004534 0.0 0.0 0.0 0.0 0.0
3 PVS 1 2019-12-24 20:20:01.040 2019-12-24 20:20:04.020 0 0.307264 0.024270 0.195187 0.385526 0.308215 0.164431 ... 0.002487 0.003691 0.0 0.007933 0.004441 0.0 0.0 0.0 0.0 0.0
4 PVS 1 2019-12-24 20:20:02.540 2019-12-24 20:20:05.520 0 0.306369 0.018382 0.263422 0.367570 0.306916 0.163311 ... 0.000186 0.000305 0.0 0.000827 0.000357 0.0 0.0 0.0 0.0 0.0

5 rows × 169 columns

Identifying and Handle Missing Values¶

In [19]:
df_out = df.copy()

missing = df_out.isna().sum()
print("Columns with missing values:")
print(missing[missing > 0])

numeric_cols_all = df_out.select_dtypes(include=[np.number]).columns
df_out[numeric_cols_all] = df_out[numeric_cols_all].fillna(df_out[numeric_cols_all].median())

print("\nTotal remaining NaNs:", df_out.isna().sum().sum())
Columns with missing values:
Series([], dtype: int64)

Total remaining NaNs: 0
In [6]:
missing = df.isna().sum()
print(missing[missing > 0])
Series([], dtype: int64)
In [7]:
numeric_cols = df.select_dtypes(include=[np.number]).columns

df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

Outliner Detection & Treatment¶

In [8]:
df_out = df.copy()

for col in numeric_cols:
    lower = df_out[col].quantile(0.001)
    upper = df_out[col].quantile(0.999)
    df_out[col] = df_out[col].clip(lower, upper)
In [20]:
sample_features = [
    c for c in df_out.select_dtypes(include=[np.number]).columns 
    if c not in ["y"]
][:8] 
plt.figure(figsize=(10, 5))
sns.boxplot(data=df_out[sample_features])
plt.title("Boxplot of Sample Numeric Features")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

for col in numeric_cols_all:
    lower = df_out[col].quantile(0.001)
    upper = df_out[col].quantile(0.999)
    df_out[col] = df_out[col].clip(lower, upper)

print("Outlier clipping done at 0.1% tails.")
No description has been provided for this image
Outlier clipping done at 0.1% tails.

Discretization¶

In [10]:
df_out = df_out.copy()

df_out["speed_bin"] = pd.cut(
    df_out["gps_speed_mean"],
    bins=[-1, 20, 40, 60, 200],
    labels=["0–20", "20–40", "40–60", ">60"]
)
In [21]:
if "gps_speed_mean" in df_out.columns:
    df_out = df_out.copy()  # defragment
    df_out["speed_bin"] = pd.cut(
        df_out["gps_speed_mean"],
        bins=[-1, 20, 40, 60, 200],
        labels=["0–20", "20–40", "40–60", ">60"]
    )
    print(df_out["speed_bin"].value_counts())
    sns.countplot(x="speed_bin", data=df_out)
    plt.title("Speed Bin Distribution")
    plt.show()
else:
    print("gps_speed_mean not found; skipping discretization.")
speed_bin
0–20     5930
20–40    1263
40–60       0
>60         0
Name: count, dtype: int64
No description has been provided for this image

Scaling and Normalization¶

In [18]:
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

numeric_cols = df_out.select_dtypes(include=[np.number]).columns

X = df_out[numeric_cols].drop(columns=["y"])
y = df_out["y"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

std_scaler = StandardScaler()
mm_scaler = MinMaxScaler()

X_train_std = std_scaler.fit_transform(X_train)
X_test_std  = std_scaler.transform(X_test)

X_train_mm = mm_scaler.fit_transform(X_train)
X_test_mm  = mm_scaler.transform(X_test)

print("X_train_std shape:", X_train_std.shape)
print("X_train_mm shape:", X_train_mm.shape)
X_train_std shape: (5754, 165)
X_train_mm shape: (5754, 165)
In [15]:
drop_cols = ["session", "t_start", "t_end"] 

X = df_out.drop(columns=drop_cols + ["y"])
y = df_out["y"]

Train/Test Split¶

In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE
)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

std_scaler = StandardScaler()
mm_scaler = MinMaxScaler()

X_train_std = std_scaler.fit_transform(X_train)
X_test_std  = std_scaler.transform(X_test)

X_train_mm = mm_scaler.fit_transform(X_train)
X_test_mm  = mm_scaler.transform(X_test)

print("X_train_std shape:", X_train_std.shape)
print("X_train_mm shape:", X_train_mm.shape)
Train shape: (5754, 165) Test shape: (1439, 165)
X_train_std shape: (5754, 165)
X_train_mm shape: (5754, 165)

Select Numeric Features, Define X and Y¶

In [23]:
LABEL_COL = "y"

numeric_cols = df_out.select_dtypes(include=[np.number]).columns

print("Is label in numeric_cols?", LABEL_COL in numeric_cols)

X = df_out[numeric_cols].drop(columns=[LABEL_COL])
y = df_out[LABEL_COL]

print("X shape:", X.shape)
print("y shape:", y.shape)
Is label in numeric_cols? True
X shape: (7193, 165)
y shape: (7193,)

Data Integration¶

In [35]:
print("Integrated dataset shape:", df.shape)
df.head()
Integrated dataset shape: (7193, 169)
Out[35]:
session t_start t_end y gps_acc_x_dashboard_mean gps_acc_x_dashboard_std gps_acc_x_dashboard_min gps_acc_x_dashboard_max gps_acc_x_dashboard_rms gps_acc_y_dashboard_mean ... speed_drop_mean speed_drop_std speed_drop_min speed_drop_max speed_drop_rms proxy_incident_mean proxy_incident_std proxy_incident_min proxy_incident_max proxy_incident_rms
0 PVS 1 2019-12-24 20:19:56.540 2019-12-24 20:19:59.520 0 0.306168 0.016296 0.270605 0.365176 0.306598 0.165220 ... 0.000099 0.000143 0.0 0.000303 0.000173 0.0 0.0 0.0 0.0 0.0
1 PVS 1 2019-12-24 20:19:58.040 2019-12-24 20:20:01.020 0 0.307486 0.026242 0.223918 0.374752 0.308596 0.163978 ... 0.000201 0.000913 0.0 0.007933 0.000932 0.0 0.0 0.0 0.0 0.0
2 PVS 1 2019-12-24 20:19:59.540 2019-12-24 20:20:02.520 0 0.306282 0.030071 0.195187 0.385526 0.307745 0.163780 ... 0.002592 0.003733 0.0 0.007933 0.004534 0.0 0.0 0.0 0.0 0.0
3 PVS 1 2019-12-24 20:20:01.040 2019-12-24 20:20:04.020 0 0.307264 0.024270 0.195187 0.385526 0.308215 0.164431 ... 0.002487 0.003691 0.0 0.007933 0.004441 0.0 0.0 0.0 0.0 0.0
4 PVS 1 2019-12-24 20:20:02.540 2019-12-24 20:20:05.520 0 0.306369 0.018382 0.263422 0.367570 0.306916 0.163311 ... 0.000186 0.000305 0.0 0.000827 0.000357 0.0 0.0 0.0 0.0 0.0

5 rows × 169 columns

Feature Selection¶

In [36]:
corr = df[numeric_cols].corr()
plt.figure(figsize=(10,7))
sns.heatmap(corr, cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.show()

from sklearn.ensemble import RandomForestClassifier

rf_temp = RandomForestClassifier(n_estimators=200)
rf_temp.fit(X_mm, y)

importances = rf_temp.feature_importances_
feature_importance_df = pd.DataFrame({
    "Feature": numeric_cols,
    "Importance": importances
}).sort_values("Importance", ascending=False)

feature_importance_df.head(15)
No description has been provided for this image
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[36], line 10
      7 from sklearn.ensemble import RandomForestClassifier
      9 rf_temp = RandomForestClassifier(n_estimators=200)
---> 10 rf_temp.fit(X_mm, y)
     12 importances = rf_temp.feature_importances_
     13 feature_importance_df = pd.DataFrame({
     14     "Feature": numeric_cols,
     15     "Importance": importances
     16 }).sort_values("Importance", ascending=False)

NameError: name 'X_mm' is not defined
In [39]:
from sklearn.ensemble import RandomForestClassifier

# Make sure df_out exists (it should be your cleaned, clipped DataFrame)
# If not, you can set: df_out = df.copy()

# 1) Select numeric columns and drop label from features
LABEL_COL = "y"
numeric_cols = df_out.select_dtypes(include=[np.number]).columns
feature_cols = [c for c in numeric_cols if c != LABEL_COL]

X_fs = df_out[feature_cols]
y_fs = df_out[LABEL_COL]

print("Feature matrix for selection:", X_fs.shape)

# 2) Train a temporary Random Forest for feature importance
rf_temp = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    class_weight="balanced"
)
rf_temp.fit(X_fs, y_fs)

# 3) Build importance table
importances = rf_temp.feature_importances_

feature_importance_df = pd.DataFrame({
    "Feature": feature_cols,
    "Importance": importances
}).sort_values("Importance", ascending=False)

# Show top 15
feature_importance_df.head(15)
Feature matrix for selection: (7193, 165)
Out[39]:
Feature Importance
156 speed_drop_std 0.118236
161 proxy_incident_std 0.096313
163 proxy_incident_max 0.094612
158 speed_drop_max 0.090670
164 proxy_incident_rms 0.070909
160 proxy_incident_mean 0.052569
159 speed_drop_rms 0.042564
155 speed_drop_mean 0.040207
132 gps_temp_below_suspension_min 0.031928
151 gps_speed_std 0.027612
99 gps_mag_y_dashboard_rms 0.024630
95 gps_mag_y_dashboard_mean 0.020392
130 gps_temp_below_suspension_mean 0.018697
148 gps_longitude_max 0.017120
82 gps_gyro_y_below_suspension_min 0.016466
In [40]:
top_n = 15
plt.figure(figsize=(10,6))
sns.barplot(
    data=feature_importance_df.head(top_n),
    x="Importance", y="Feature"
)
plt.title("Top Feature Importances (Random Forest)")
plt.tight_layout()
plt.show()
No description has been provided for this image

Feature Extraction¶

In [42]:
import numpy as np

df_feat = df_out.copy()

acc_cols = [c for c in df_feat.columns if "acc" in c.lower()]
gyro_cols = [c for c in df_feat.columns if "gyro" in c.lower()]

print("Number of accelerometer-related features:", len(acc_cols))
print("Number of gyroscope-related features:", len(gyro_cols))

if acc_cols:
    df_feat["acc_features_mean"] = df_feat[acc_cols].mean(axis=1)
    df_feat["acc_features_max"]  = df_feat[acc_cols].max(axis=1)
    df_feat["acc_features_std"]  = df_feat[acc_cols].std(axis=1)
else:
    print("No accelerometer-related columns found for aggregation.")

if gyro_cols:
    df_feat["gyro_features_mean"] = df_feat[gyro_cols].mean(axis=1)
    df_feat["gyro_features_max"]  = df_feat[gyro_cols].max(axis=1)
    df_feat["gyro_features_std"]  = df_feat[gyro_cols].std(axis=1)
else:
    print("No gyroscope-related columns found for aggregation.")

print("New feature columns added:")
new_cols = [c for c in df_feat.columns if "features_" in c]
print(new_cols)

df_feat[new_cols].head()
Number of accelerometer-related features: 45
Number of gyroscope-related features: 45
New feature columns added:
['acc_features_mean', 'acc_features_max', 'acc_features_std', 'gyro_features_mean', 'gyro_features_max', 'gyro_features_std']
Out[42]:
acc_features_mean acc_features_max acc_features_std gyro_features_mean gyro_features_max gyro_features_std
0 2.780080 10.004245 4.324094 0.156156 3.960586 0.617627
1 2.775642 10.004245 4.328600 0.144142 3.884292 0.615608
2 2.775775 10.112569 4.331216 0.033942 0.789833 0.292524
3 2.774114 10.112569 4.331920 0.031751 0.690651 0.278834
4 2.771142 10.004843 4.329112 0.101573 2.633072 0.428303

Assignment 5 - Modeling & Evaluation¶

Evaluation helper function¶

In [24]:
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, 
    f1_score, roc_auc_score, confusion_matrix, classification_report
)

def evaluate_model(name, y_true, y_pred, y_proba):
    print(f"\n===== {name} =====")
    print("Accuracy :", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred, zero_division=0))
    print("Recall   :", recall_score(y_true, y_pred, zero_division=0))
    print("F1 Score :", f1_score(y_true, y_pred, zero_division=0))
    print("ROC AUC  :", roc_auc_score(y_true, y_proba))

    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, zero_division=0))

    cm = confusion_matrix(y_true, y_pred)
    sns.heatmap(cm, annot=True, cmap='Blues', fmt='d')
    plt.title(f"Confusion Matrix – {name}")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

Logistic Regression - Baseline Model¶

In [25]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(
    class_weight='balanced',
    max_iter=2000,
    random_state=RANDOM_STATE
)

lr.fit(X_train_std, y_train)

y_pred_lr = lr.predict(X_test_std)
y_proba_lr = lr.predict_proba(X_test_std)[:, 1]

evaluate_model("Logistic Regression", y_test, y_pred_lr, y_proba_lr)
===== Logistic Regression =====
Accuracy : 1.0
Precision: 1.0
Recall   : 1.0
F1 Score : 1.0
ROC AUC  : 1.0

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1437
           1       1.00      1.00      1.00         2

    accuracy                           1.00      1439
   macro avg       1.00      1.00      1.00      1439
weighted avg       1.00      1.00      1.00      1439

No description has been provided for this image

Random Foreset Classifier (Tree-Based)¶

In [26]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=400,
    max_depth=None,
    min_samples_split=2,
    class_weight='balanced',
    n_jobs=-1,
    random_state=RANDOM_STATE
)

rf.fit(X_train_mm, y_train)

y_pred_rf = rf.predict(X_test_mm)
y_proba_rf = rf.predict_proba(X_test_mm)[:, 1]

evaluate_model("Random Forest", y_test, y_pred_rf, y_proba_rf)
===== Random Forest =====
Accuracy : 1.0
Precision: 1.0
Recall   : 1.0
F1 Score : 1.0
ROC AUC  : 1.0

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1437
           1       1.00      1.00      1.00         2

    accuracy                           1.00      1439
   macro avg       1.00      1.00      1.00      1439
weighted avg       1.00      1.00      1.00      1439

No description has been provided for this image

Gradient Boosting Classifier¶

In [27]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(
    n_estimators=250,
    learning_rate=0.1,
    max_depth=3,
    random_state=RANDOM_STATE
)

gb.fit(X_train_mm, y_train)

y_pred_gb = gb.predict(X_test_mm)
y_proba_gb = gb.predict_proba(X_test_mm)[:, 1]

evaluate_model("Gradient Boosting", y_test, y_pred_gb, y_proba_gb)
===== Gradient Boosting =====
Accuracy : 1.0
Precision: 1.0
Recall   : 1.0
F1 Score : 1.0
ROC AUC  : 1.0

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1437
           1       1.00      1.00      1.00         2

    accuracy                           1.00      1439
   macro avg       1.00      1.00      1.00      1439
weighted avg       1.00      1.00      1.00      1439

No description has been provided for this image

SVM with RBF Kernel¶

In [28]:
from sklearn.svm import SVC

svm = SVC(
    kernel='rbf',
    C=1.0,
    gamma='scale',
    class_weight='balanced',
    probability=True,
    random_state=RANDOM_STATE
)

svm.fit(X_train_std, y_train)

y_pred_svm = svm.predict(X_test_std)
y_proba_svm = svm.predict_proba(X_test_std)[:, 1]

evaluate_model("SVM (RBF)", y_test, y_pred_svm, y_proba_svm)
===== SVM (RBF) =====
Accuracy : 1.0
Precision: 1.0
Recall   : 1.0
F1 Score : 1.0
ROC AUC  : 1.0

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1437
           1       1.00      1.00      1.00         2

    accuracy                           1.00      1439
   macro avg       1.00      1.00      1.00      1439
weighted avg       1.00      1.00      1.00      1439

No description has been provided for this image

Comparison Table - Accuracy, Precision, Recall, F1, ROC-AUC¶

In [29]:
def get_scores(y_true, y_pred, y_proba):
    return {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, zero_division=0),
        "Recall": recall_score(y_true, y_pred, zero_division=0),
        "F1": f1_score(y_true, y_pred, zero_division=0),
        "ROC_AUC": roc_auc_score(y_true, y_proba)
    }

results = []
results.append(("Logistic Regression", get_scores(y_test, y_pred_lr, y_proba_lr)))
results.append(("Random Forest", get_scores(y_test, y_pred_rf, y_proba_rf)))
results.append(("Gradient Boosting", get_scores(y_test, y_pred_gb, y_proba_gb)))
results.append(("SVM (RBF)", get_scores(y_test, y_pred_svm, y_proba_svm)))

rows = []
for name, scores in results:
    row = {"Model": name}
    row.update(scores)
    rows.append(row)

results_df = pd.DataFrame(rows)
display(results_df.sort_values(by="F1", ascending=False))
Model Accuracy Precision Recall F1 ROC_AUC
0 Logistic Regression 1.0 1.0 1.0 1.0 1.0
1 Random Forest 1.0 1.0 1.0 1.0 1.0
2 Gradient Boosting 1.0 1.0 1.0 1.0 1.0
3 SVM (RBF) 1.0 1.0 1.0 1.0 1.0

Error Analysis - False Positives & False Negatives¶

In [30]:
best_pred = y_pred_rf   # change if another model is best

false_negatives_idx = np.where((y_test == 1) & (best_pred == 0))[0]
false_positives_idx = np.where((y_test == 0) & (best_pred == 1))[0]

print("False Negatives (missed accidents):", len(false_negatives_idx))
print("False Positives (false alarms):", len(false_positives_idx))
False Negatives (missed accidents): 0
False Positives (false alarms): 0

Model Evaluation Function¶

In [31]:
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report
)

def evaluate_model(name, y_test, y_pred, y_proba):
    print(f"\n===== {name} =====")
    print("Accuracy :", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred, zero_division=0))
    print("Recall   :", recall_score(y_test, y_pred, zero_division=0))
    print("F1 Score :", f1_score(y_test, y_pred, zero_division=0))
    print("ROC AUC  :", roc_auc_score(y_test, y_proba))

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, zero_division=0))

    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, cmap='Blues', fmt='d')
    plt.title(f"Confusion Matrix – {name}")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()
In [32]:
evaluate_model("Logistic Regression", y_test, y_pred_lr, y_proba_lr)
evaluate_model("Random Forest", y_test, y_pred_rf, y_proba_rf)
evaluate_model("Gradient Boosting", y_test, y_pred_gb, y_proba_gb)
evaluate_model("SVM (RBF)", y_test, y_pred_svm, y_proba_svm)
===== Logistic Regression =====
Accuracy : 1.0
Precision: 1.0
Recall   : 1.0
F1 Score : 1.0
ROC AUC  : 1.0

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1437
           1       1.00      1.00      1.00         2

    accuracy                           1.00      1439
   macro avg       1.00      1.00      1.00      1439
weighted avg       1.00      1.00      1.00      1439

No description has been provided for this image
===== Random Forest =====
Accuracy : 1.0
Precision: 1.0
Recall   : 1.0
F1 Score : 1.0
ROC AUC  : 1.0

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1437
           1       1.00      1.00      1.00         2

    accuracy                           1.00      1439
   macro avg       1.00      1.00      1.00      1439
weighted avg       1.00      1.00      1.00      1439

No description has been provided for this image
===== Gradient Boosting =====
Accuracy : 1.0
Precision: 1.0
Recall   : 1.0
F1 Score : 1.0
ROC AUC  : 1.0

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1437
           1       1.00      1.00      1.00         2

    accuracy                           1.00      1439
   macro avg       1.00      1.00      1.00      1439
weighted avg       1.00      1.00      1.00      1439

No description has been provided for this image
===== SVM (RBF) =====
Accuracy : 1.0
Precision: 1.0
Recall   : 1.0
F1 Score : 1.0
ROC AUC  : 1.0

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1437
           1       1.00      1.00      1.00         2

    accuracy                           1.00      1439
   macro avg       1.00      1.00      1.00      1439
weighted avg       1.00      1.00      1.00      1439

No description has been provided for this image

Ranking Models¶

In [33]:
model_scores = {
    "Logistic Regression": f1_score(y_test, y_pred_lr),
    "Random Forest": f1_score(y_test, y_pred_rf),
    "Gradient Boosting": f1_score(y_test, y_pred_gb),
    "SVM": f1_score(y_test, y_pred_svm)
}

pd.DataFrame(model_scores.items(), columns=["Model", "F1 Score"]).sort_values("F1 Score", ascending=False)
Out[33]:
Model F1 Score
0 Logistic Regression 1.0
1 Random Forest 1.0
2 Gradient Boosting 1.0
3 SVM 1.0

Hyperparameter Tuning¶

In [34]:
param_grid_rf = {
    "n_estimators": [200, 400],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5],
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

rf_tuned = GridSearchCV(
    estimator=RandomForestClassifier(
        class_weight="balanced",
        random_state=RANDOM_STATE,
        n_jobs=-1
    ),
    param_grid=param_grid_rf,
    scoring="f1",    # focus on F1 for accident class
    cv=cv,
    n_jobs=-1,
    verbose=1
)

rf_tuned.fit(X_train_mm, y_train)
print("Best parameters:", rf_tuned.best_params_)
print("Best CV F1-score:", rf_tuned.best_score_)

best_rf = rf_tuned.best_estimator_
y_pred_rf_best = best_rf.predict(X_test_mm)
y_proba_rf_best = best_rf.predict_proba(X_test_mm)[:, 1]

eval_model("Random Forest (Tuned)", y_test, y_pred_rf_best, y_proba_rf_best)
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Best CV F1-score: 0.9333333333333332
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[34], line 30
     27 y_pred_rf_best = best_rf.predict(X_test_mm)
     28 y_proba_rf_best = best_rf.predict_proba(X_test_mm)[:, 1]
---> 30 eval_model("Random Forest (Tuned)", y_test, y_pred_rf_best, y_proba_rf_best)

NameError: name 'eval_model' is not defined